In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from spacy.lang.fr import stop_words
from sklearn.linear_model import LogisticRegression
# From here on `stop_words` no longer refers to the imported spaCy module:
# it is rebound to a plain list of the module's STOP_WORDS entries, which is
# the value handed to TfidfVectorizer(stop_words=...) further down.
stop_words = [*stop_words.STOP_WORDS]
from sklearn.metrics import recall_score, accuracy_score
from sklearn.pipeline import Pipeline
from hyperopt import fmin

### Preprocessing 
We preprocess the data, especially the text, to extract features. For that, we use TfidfVectorizer from sklearn to turn the texts into a matrix of TF-IDF features (one row per text).  
We also pass the French stop words from spaCy to TfidfVectorizer through its `stop_words` parameter.

In [39]:
# Read the train / test / validation splits, each stored as its own CSV file.
df_train, df_test, df_valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/train.csv",
        "../../data/test.csv",
        "../../data/valid.csv",
    )
)

In [40]:
# The TF-IDF vocabulary and weights are learned from the training reviews only;
# the test and validation reviews are merely projected onto that fitted space.
vectorizer = TfidfVectorizer(stop_words=stop_words)
X_train = vectorizer.fit_transform(df_train["review"])
X_test, X_valid = (
    vectorizer.transform(frame["review"]) for frame in (df_test, df_valid)
)

# Labels: the "polarity" column of each split, as NumPy arrays.
Y_train, Y_test, Y_valid = (
    frame["polarity"].to_numpy() for frame in (df_train, df_test, df_valid)
)



### Model 
We fit a simple logistic regression on the training data that we just built, using the "polarity" column as the label.  
As the scoring metric we use accuracy: it is the most appropriate one for this task because it directly measures how often the predictor is mistaken.

In [41]:
# Train a logistic regression (liblinear solver) on the TF-IDF training
# matrix and the matching polarity labels.
logReg = LogisticRegression(solver="liblinear")
logReg.fit(X_train, Y_train)

In [None]:
# Score the fitted model on the test split.
# scikit-learn metrics expect (y_true, y_pred) in that order. Accuracy happens
# to be symmetric, but keeping the documented order avoids silently wrong
# numbers if this line is reused for an asymmetric metric such as recall.
test_accuracy_score = accuracy_score(Y_test, logReg.predict(X_test))
print("Accuracy score for Logistic Regression on test data :", test_accuracy_score)

Recall score for Logistic Regression on test data : 0.9098603038645865
Accuracy score for Logistic Regression on test data : 0.92235


### Pipeline Scikit-Learn
We build a pipeline that chains a TfidfVectorizer and the Logistic Regression; both steps are fit again on the training data.

In [47]:
# Chain vectorisation and classification in one estimator so raw review text
# can be passed straight in; Pipeline.fit returns the fitted pipeline itself.
pipe = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(stop_words=stop_words)),
        ("logreg", LogisticRegression(solver="liblinear")),
    ]
).fit(df_train["review"], df_train["polarity"])

# Predictions on the raw test reviews.
y_pred = pipe.predict(df_test["review"])



In [48]:
# Score the pipeline on the test split, reusing `y_pred` computed by the
# pipeline cell just above instead of vectorising and predicting the whole
# test set a second time.
# scikit-learn metrics expect (y_true, y_pred) in that order.
test_accuracy_score = accuracy_score(Y_test, y_pred)
print("Accuracy score for Logistic Regression on test data :", test_accuracy_score)

Accuracy score for Logistic Regression on test data : 0.92235


### Hyperparameter optimisation

In [None]:
def objective(args):
    """Placeholder objective for the hyper-parameter search.

    Not implemented yet: ``args`` is ignored and ``None`` is returned.
    Presumably meant to be minimised with ``hyperopt.fmin`` (imported at the
    top of the notebook) -- TODO: implement and return the loss to minimise.
    """
    return None