In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [4]:
df = pd.read_csv('../static/data/labeledTrainData.tsv', delimiter="\t")
df

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...
...,...,...,...
24995,3453_3,0,It seems like more consideration has gone into...
24996,5064_1,0,I don't believe they made this film. Completel...
24997,10905_3,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,10194_3,0,This 30 minute documentary Buñuel made in the ...


In [5]:
df['review'] = df['review'].str.replace('<br />',' ')
df['review'] = df.review.apply(lambda x: re.sub('[^a-zA-Z]',' ',x))

In [6]:
feature_df = df.drop(['id','sentiment'],axis=1,inplace=False)
X_train,X_test,y_train,y_test = train_test_split(
   feature_df,df.sentiment,test_size=0.3,random_state=156
)
X_train.shape,X_test.shape

((17500, 1), (7500, 1))

In [7]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())
])
params = ({
    'count_vect__max_df': [100, 300, 500],
    'lr_clf__C': [1, 5, 10]
})
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3,
                         scoring='accuracy', verbose=1, n_jobs=-1)
grid_pipe.fit(X_train.review, y_train)
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


In [5]:
from sklearn.datasets import fetch_20newsgroups
news_data = fetch_20newsgroups(subset='all',random_state=156)
test_news = fetch_20newsgroups(subset='test',random_state=156,
                                remove=('headers','footers','quotes'))
news_df = pd.DataFrame({'news':test_news.data,'target':test_news.target})
news_df.to_csv('../static/data/20news_test.csv')

In [14]:
df = pd.read_csv('../static/data/20news_test.csv',index_col='Unnamed: 0')
df['index']

KeyError: 'index'