In [None]:
import numpy as np
import pandas as pd

# Load the tab-separated movie reviews file into a DataFrame.
# NOTE(review): the path is relative, so it resolves against the notebook's
# working directory.
df = pd.read_csv('../../pythongyak/UPDATED_NLP_COURSE/TextFiles/moviereviews.tsv', sep='\t')
# Preview the first rows.
df.head()

In [None]:
# number of rows in the freshly loaded frame
len(df)

In [None]:
# look at the first review: row label 0 of the 'review' column,
# fetched with a single .loc lookup
df.loc[0, 'review']

In [None]:
# count the missing (NaN) entries per column -- some reviews are empty
df.isna().sum()

In [None]:
# remove every row that holds a missing (NaN) value in any column
# inplace=True --> df itself is modified, so the drop is permanent
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# row count after dropping the rows with missing values
len(df)

In [None]:
# confirm that no missing values remain in any column
df.isna().sum()

Datasets often contain empty or whitespace-only strings in addition to proper missing (NaN/null) values.

Such rows should be removed as well; there are several ways to do this.

In [None]:
# one simple way to do this:
# Collect the index labels of every review that has no visible text.
# Stripping whitespace and comparing with '' catches both whitespace-only
# strings and truly empty strings (str.isspace() returns False for ''), and
# selecting the column by name does not depend on how many columns df has.
blanks = df.index[df['review'].str.strip() == ''].tolist()

In [None]:
# index labels (not positional offsets) of the empty/blank reviews;
# these are the labels handed to df.drop below
blanks

In [None]:
# remove the blank reviews by index label; df is modified in place
df.drop(index=blanks, inplace=True)

In [None]:
# row count after removing the blank reviews
len(df)

In [None]:
# split data
from sklearn.model_selection import train_test_split

X = df['review']  # review text used as the model input
y = df['label']   # label column used as the prediction target

# Hold out 33% of the rows for testing. The fixed random_state pins the
# shuffle, so the split -- and every metric computed from it further down --
# is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# build pipeline to vectorise data
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Two chained steps: TF-IDF vectorisation of the text, then a linear
# support vector classifier fitted on the resulting features.
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('svc', LinearSVC()),
])

In [None]:
# fit the whole pipeline (vectoriser + classifier) on the training split
text_clf.fit(X_train,y_train)

In [None]:
# predict a label for every review in the held-out test split;
# the result is compared against y_test in the evaluation cell
predictions = text_clf.predict(X_test)

In [None]:
# evaluate
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

# Each metric is printed under its heading; sep='\n' puts the heading and
# the value on separate lines, and print('\n') leaves a gap between sections.
print('Confusion matrix:', confusion_matrix(y_test, predictions), sep='\n')
print('\n')
print('Classification report:', classification_report(y_test, predictions), sep='\n')
print('\n')
print('Accuracy:', accuracy_score(y_test, predictions), sep='\n')
