In [1]:
#import sys
#!{sys.executable} -m spacy download en
#import nltk
#nltk.download('punkt')
#nltk.download('stopwords')

In [9]:
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import spacy
nlp = spacy.load('en', parse=True, tag=True, entity=True)

from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

import joblib

In [2]:
df = pd.read_csv('review_sentiment.csv')
df.rename(columns = {'Text':'text', 'sentiment':'new_sentiment'}, inplace=True)
df = df[['Score','text','new_sentiment']]
df.dropna(inplace=True)
pd.crosstab(df.new_sentiment, 'freq', margins=True)

col_0,freq,All
new_sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,30178,30178
neutral,5002,5002
positive,113665,113665
All,148845,148845


In [3]:
def tokenize_tweet(tweet):
  text = word_tokenize(tweet)
  new_words= [word for word in text if word.isalpha()]
  return new_words

def lower_ph(tweet):
  text = tweet.lower()
  return text

def remove_stopwords(tweet):
  tweet = [word for word in tweet if not word in stopwords.words('english')]
  return tweet

def lemmatize_text(tweet):
    text = nlp(tweet)
    text = ''.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [5]:
%%time
clean_data = []

for text in df['text']:
    low = lower_ph(text)
    tok = tokenize_tweet(low)
    stop = remove_stopwords(tok)
    lemma = lemmatize_text(str(tok))
    clean_data.append(lemma)

CPU times: user 2h 13min 41s, sys: 17min 34s, total: 2h 31min 16s
Wall time: 2h 32min 29s


In [6]:
df['clean_tweet'] = clean_data
df.to_csv('review_clean.csv')

In [2]:
df = pd.read_csv('review_clean.csv')

### Machine Learning Model 

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_tweet'], df['new_sentiment'], test_size=0.2)

In [5]:
categories = df['new_sentiment'].unique().tolist()

In [4]:
encoder = LabelEncoder()
train_y_encoded = encoder.fit_transform(y_train)
test_y_encoded = encoder.fit_transform(y_test)

In [7]:
%%time
tfidf_vect = TfidfVectorizer(max_features=400)
tfidf_vect.fit(df['clean_tweet'])
train_x_tfidf = tfidf_vect.transform(X_train)
test_x_tfidf = tfidf_vect.transform(X_test)

CPU times: user 25 s, sys: 640 ms, total: 25.6 s
Wall time: 32.4 s


#### Naive Bayes 

In [7]:
%%time
naive = naive_bayes.MultinomialNB().fit(train_x_tfidf,y_train)
predictions_NB = naive.predict(test_x_tfidf)
print("Naive Bayes Accuracy Score: ",accuracy_score(predictions_NB, y_test)*100)

Naive Bayes Accuracy Score:  76.01531794820114
CPU times: user 482 ms, sys: 12.1 ms, total: 494 ms
Wall time: 510 ms


#### KNeighborsClassifier

In [8]:
%%time
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=4, weights='distance').fit(train_x_tfidf,y_train)
prediction = clf.predict(test_x_tfidf)
print("KNeighborsClassifier Acurracy Score: ",accuracy_score(y_test, prediction)*100)

KNeighborsClassifier Acurracy Score:  73.36826900466929
CPU times: user 4min 11s, sys: 37.9 s, total: 4min 49s
Wall time: 4min 54s


#### SVM

In [17]:
svm = Pipeline([('vect', CountVectorizer()), 
                         ('tfidf', TfidfTransformer(use_idf=False)), 
                         ('clf-svm', SGDClassifier(
                             loss='modified_huber', 
                             penalty='l2', 
                             alpha=0.0001, 
                             random_state=42, 
                             max_iter=20))])

In [18]:
svm = svm.fit(X_train, y_train)

In [19]:
predictions = svm.predict(X_test)
print("SVM Accuracy Score: ",accuracy_score(predictions, y_test)*100)

SVM Accuracy Score:  83.8053008162854


In [20]:
# saving the model
model = 'svm.sav'
joblib.dump(svm, model)

model = 'svm.joblib'
joblib.dump(svm, model)

['svm.joblib']