In [1]:
"""
# SENTIMENT CLASSIFICATION PROJECT OVERVIEW

    In this project, a labelled training set is provided with text snippets labelled as either "Positive" or "Negative". 
    The model is trained based on this input data to predict the sentiments of the text snippets in the test set. The 
    test set provided has numeric ids and text snippets, which is used for predicting the outcome. The output will 
    contain the id and predictions (positive or negative).

### Participants
    Baburaj Velayudhan
    bvelayudhan@vmware.com
    CMBU

### Summary   
    The input text data is cleaned to remove any @mentions, URLs, hashtags, punctuations, html etc and a train-test split 
    is created for validating the trained model. Using a wordcloud, the most common words in positive and negative 
    sentiments are identified. There are many common words in positive and negative sentiments.

    I have used MultinomialNB with CountVectorizer and TfidfVectorizer. The model accuracy is compared with that of a
    LogisticRegression model with CountVectorizer and TfidfVectorizer. Based on my trials, I have selected the
    MultinomialNB trigram model for the final predictions.
   
### Feature Selection
    For feature selection, SelectKBest with the chi2 score function is used. The highest accuracy (76.57%) is reached
    with 1707 features for trigrams using the Tfidf vectorizer.
   
### Training methodology
    MultinomialNB with CountVectorizer and TfidfVectorizer:
   
    As a first step, CountVectorizer is used with the MultinomialNB classifier on unigrams, bigrams and trigrams. The model 
    accuracy is computed for feature counts ranging from 1 to 10000, with and without stop words. A trigram model showed high 
    accuracy with stop words. The classification reports for the bigram and trigram models indicate almost the same accuracy 
    of 75.53% and an f1-score of 70 for Negative and 79 for Positive cases. The process is repeated with TfidfVectorizer and 
    MultinomialNB, and it is observed that a trigram model with tfidf had 76.58% accuracy with 1901 features.
   
    LogisticRegression with CountVectorizer and TfidfVectorizer:
       The same process is repeated with LogisticRegression model and it shows that the bi-gram and tri-gram accuracy drops 
       from the previous trials.

    With the above, MultinomialNB was chosen with Tfidf maximum features of 1901 and a SelectKBest showed 76.57% accuracy 
    with 1707 features. The classification report showed 73% accuracy for negative and 79% accuracy for positive cases. 
    Hence this was chosen for predictions.

### Notable aspects
    Many terms were common in positive and negative sentiments. I believe, if such words were filtered out, the model accuracy
    would have improved a bit more.
    
### References
    The data cleaning and model training approach is highly influenced by Ricky Kim's article published at 
    https://towardsdatascience.com/another-twitter-sentiment-analysis-bb5b01ebad90
   
### Assumptions
    (a) The input data file is located in the current folder and the file name is Export_loop-sentiment-pos-neg-train_05112020000000.csv
    (b) The test data file is located in the current folder and the file name is sentiment-eval.csv
    (c) The final predictions will be located in the current folder and the file name is predictions.csv
        
### python and library versions
        Python        : 3.7.6
        sklearn       : 0.22.1
        numpy         : 1.18.1
        seaborn       : 0.10.0
        Beautiful Soup: 4.8.2
        Matplotlib    : 3.1.3
        pandas        : 1.0.1
        wordcloud     : 1.7.0
        NLTK          : 3.4.5

"""


'\n# SENTIMENT CLASSIFICATION PROJECT OVERVIEW\n\n    In this project, a labelled test set is provided with text snippets labelled as either "Positive" or "Negative". \n    The model is trained based on this input data to predict the sentiments of the text snippets in the test set. The \n    test set provided has numeric ids and text snippets, which is used for predicting the outcome. The output will \n    contain the id and predictions (positive or negative).\n\n### Participants\n    Baburaj Velayudhan\n    bvelayudhan@vmware.com\n    CMBU\n\n### Summary   \n    The input text data is cleaned to remove any @mentions, URLs, hashtags, punctuations, html etc and a train-test split \n    is created for validating the trained model. Using a wordcloud, the most common words in positive and negative \n    sentiments are identified. There are many common words in positive and negative sentiments.\n\n    I have used MultinomialNB with CountVectorizer and TfidfVectorizer. To model accuracy is c

In [2]:
# import libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction import text 
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import chi2,SelectKBest
from bs4 import BeautifulSoup
from wordcloud import WordCloud
from nltk.tokenize import WordPunctTokenizer

import pandas as pd
import re
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

plt.style.use('fivethirtyeight')
%matplotlib inline

In [3]:
# Load the labelled training data. The path is relative to the current
# working directory, not to a "data" subfolder.

FILE_NAME = "./Export_loop-sentiment-pos-neg-train_05112020000000.csv"
data = pd.read_csv(FILE_NAME)

In [4]:
# Sanity check: print the (rows, columns) shape of the loaded frame.
print("Data set shape")
print(data.shape)

Data set shape
(1900, 2)


In [5]:
# Column overview: dtype and non-null count of every column.
print("Overall info on Data set")
data.info()

Overall info on Data set
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1900 entries, 0 to 1899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   1900 non-null   object
 1   text    1900 non-null   object
dtypes: object(2)
memory usage: 29.8+ KB


In [6]:
# List the distinct class labels present in the 'label' column.
print("Unique value of labels in Data set")
data['label'].unique()

Unique value of labels in Data set


array(['Negative', 'Positive'], dtype=object)

In [7]:
# Count the rows per label to see how balanced the two classes are.
print("Count of distinct label values")
data['label'].value_counts()

Count of distinct label values


Positive    1013
Negative     887
Name: label, dtype: int64

In [8]:
# The data set has no null values and has 1900 rows in total.
# The distinct values in 'label' are 'Positive' and 'Negative'.

# Out of 1900 rows, 1013 are positive and 887 are negative, which shows
# that the data is not heavily skewed towards either sentiment.

# The 'text' column needs some cleaning to remove @mentions,
# URLs, punctuation, white space, Unicode byte-order marks,
# hashtags, numbers etc.

# After cleaning the dataset, do a train-test split to
# validate the accuracy of the models.


In [9]:
# Clean the text to remove any:
# @mentions
# html
# punctuation
# urls
# numbers
# hashtags
# unicode byte-order marks
#

# Tokenizer used by clean_text() to split the cleaned string into tokens.
token = WordPunctTokenizer()
# Patterns stripped from the raw text before any other cleaning.
at = r'@[A-Za-z0-9_]+'      # @mentions
url = r'https?://[^ ]+'     # http:// and https:// links, up to the next space
combined = r'|'.join((at, url))
# Scheme-less links such as "www.example.com". The dot is escaped and the
# match must start on a word boundary: with a bare "." the pattern also
# matched ordinary text (e.g. "awww so cute" lost "www so").
www_pat = r'\bwww\.[^ ]+'

# The cleaning step replaces every non-letter (including "'") with a space,
# which would turn "isn't" into "isn t" and lose the negation. Contractions
# are therefore expanded first: keys are the lower-case contractions to look
# for, values are the expanded forms that replace them.

negations_dic = {
    "isn't": "is not",
    "aren't": "are not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "can't": "can not",
    "couldn't": "could not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not",
    "you're": "you are",
    "you'll": "you will",
    "we'll": "we will",
    "we've": "we have",
    "you've": "you have",
    "i'm": "i am",
}

# A single alternation over all contractions, delimited by word boundaries.
neg_pattern = re.compile(r'\b({})\b'.format('|'.join(negations_dic)))

def clean_text(text):
    """Normalise one raw text snippet into lower-case, space-separated words.

    Steps, in order: strip HTML markup, remove @mentions and URLs,
    lower-case, expand the contractions listed in ``negations_dic``,
    replace every non-letter character with a space, and keep only tokens
    longer than one character.

    Parameters
    ----------
    text : str
        Raw snippet, possibly containing HTML, @mentions and links.

    Returns
    -------
    str
        The cleaned snippet; an empty string when no token survives.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()

    # The former ``stripped.decode("utf-8-sig")`` step sat behind a bare
    # ``except:``. On Python 3 a str has no ``decode`` method, so the branch
    # always failed and the handler only served to hide unrelated errors.
    # It is removed; BOM / replacement characters are non-letters and are
    # blanked by the letters-only substitution further down, as before.
    clean = re.sub(combined, '', souped)

    # Second pass kept from the original: deleting a match can join the
    # surrounding text into a new mention/URL (e.g. "http@x://y").
    clean = re.sub(combined, '', clean)
    clean = re.sub(www_pat, '', clean)
    clean = clean.lower()

    # Expand contractions while the apostrophe is still present.
    clean = neg_pattern.sub(lambda match: negations_dic[match.group()], clean)
    clean = re.sub("[^a-zA-Z]", " ", clean)

    # Tokenising and re-joining collapses the runs of spaces left behind by
    # the substitutions; single-character tokens are dropped.
    words = [word for word in token.tokenize(clean) if len(word) > 1]
    return " ".join(words).strip()

# Clean every snippet and normalise the labels to lower case, so that they
# read 'positive' / 'negative' from here on.
data['text'] = data['text'].map(clean_text)

data['label'] = data['label'].str.lower()

In [10]:
# Preview the first rows after cleaning and label normalisation.
print("Cleaned Data")
data.head()

Cleaned Data


Unnamed: 0,label,text
0,negative,no one cares about marketing slides technical ...
1,positive,are all three hosts providing storage capacity...
2,negative,would loved to had managed to get down to the ...
3,negative,vending machine at work is out of dasani water...
4,positive,rt paul maritz ceo and president of vmware is ...


In [None]:
# Word cloud of the snippets labelled negative.

print("WordCloud representation for negative sentiments")

negative = data[data["label"] == "negative"]

# WordCloud.generate() takes one string, so join all negative snippets.
neg_string = " ".join(negative["text"])

wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(neg_string)
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

WordCloud representation for negative sentiments


In [None]:
# Word cloud of the snippets labelled positive.
print("WordCloud representation for positive sentiments")

positive = data[data["label"] == "positive"]

# WordCloud.generate() takes one string, so join all positive snippets.
pos_string = " ".join(positive["text"])

wordcloud = WordCloud(width=1600, height=800, max_font_size=200).generate(pos_string)

plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()


In [None]:
# Narrative output: observations the author read off the word clouds.
print("Some of the words appear in both the positive and negative sentiments. for example, vmware, work, host etc.")
print("Words such as  still, got, work, issue,day etc appear in negative sentiments a lot")

In [None]:
# Narrative output: words the author saw in both word clouds.
print(" The words such as just, like, vmware, does, need etc appear in both the positive and negative sentiments")

In [None]:
# Hold out 20% of the cleaned data to validate the models. The split is
# stratified on the label and uses a fixed random_state.

X, y = data["text"], data["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1,
)

# Put text and label side by side again so each split can be summarised.
X_train_DF = pd.concat([X_train, y_train], axis=1)
X_test_DF = pd.concat([X_test, y_test], axis=1)

train_shape, test_shape = X_train_DF.shape, X_test_DF.shape
train_row, train_col = train_shape
test_row, test_col = test_shape
print("There are {train_row} rows in training set and {test_row} rows in test set".format(train_row=train_row, test_row=test_row))

# Label distribution per split.
print("Count of distinct label values in train data")
print(X_train_DF['label'].value_counts())

print("Count of distinct label values in test data")
print(X_test_DF['label'].value_counts())

In [None]:
# Baseline 1: bag-of-words counts with English stop words removed,
# classified with a MultinomialNB model.

# CountVectorizer with stop word elimination; it is fitted on the training
# split and then applied unchanged to the test split.
cvect = CountVectorizer(stop_words='english')

X_train_dtm = cvect.fit_transform(X_train)
X_test_dtm = cvect.transform(X_test)

# use Naive Bayes classifier
mnb = MultinomialNB()
mnb.fit(X_train_dtm, y_train)
y_pred_class = mnb.predict(X_test_dtm)

# accuracy on the test split "without stop words" (i.e. stop words removed)
accur_wo_sw = mnb.score(X_test_dtm,y_test)

In [None]:
# Report the stop-words-removed baseline accuracy as a percentage.
print("The model accuracy without stop words is {accuracy_score} ".format(accuracy_score=accur_wo_sw *100))

In [None]:
# Null accuracy: the accuracy of always predicting the more frequent label
# of the test split.

y_test_num = y_test.to_frame()

# Encode the labels as integer category codes; with two labels the mean of
# the codes is the share of rows carrying code 1.
y_test_num['label'] = y_test_num['label'].astype("category")
y_test_num['labelNum'] = y_test_num['label'].cat.codes

code_one_share = y_test_num['labelNum'].mean()
null_accuracy = max(code_one_share, 1 - code_one_share)
print("The null accuracy is {null_accuracy}".format(null_accuracy=null_accuracy*100))


In [None]:
# Baseline 2: same model, but the vectorizer keeps the stop words
# (no stop word elimination).

cvect = CountVectorizer()
X_train_dtm = cvect.fit_transform(X_train)
X_test_dtm = cvect.transform(X_test)

# use Naive Bayes to predict the sentiment label
mnb = MultinomialNB()
mnb.fit(X_train_dtm, y_train)
y_pred_class = mnb.predict(X_test_dtm)

# accuracy on the test split, reported next to the null accuracy computed earlier
accur_with_sw =mnb.score(X_test_dtm,y_test)
print("The model accuracy with stop words is {accuracy_score} and the null accuracy is {null_accuracy}".format(accuracy_score=accur_with_sw*100,null_accuracy=null_accuracy*100))

In [None]:
# Here we see that the model that keeps stop words has a better accuracy than the one with stop words removed.
# Now identify the ideal number of features by iterating over a range of feature counts
# and visualize the accuracy in a chart.

def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline`` on the training data and return its test accuracy.

    Parameters
    ----------
    pipeline : object exposing ``fit(x, y)`` (returning the fitted model)
        and ``predict(x)``
    x_train, y_train : training inputs and labels
    x_test, y_test : inputs and labels the fitted model is scored on

    Returns
    -------
    The value of ``metrics.accuracy_score(y_test, predictions)``.
    """
    fitted = pipeline.fit(x_train, y_train)
    return metrics.accuracy_score(y_test, fitted.predict(x_test))

def nfeature_accuracy_checker(n_features, stop_words, ngram_range, classifier, vectorizer):
    """Sweep ``max_features`` and record the test accuracy for each value.

    For every ``n`` in ``n_features`` a vectorizer -> classifier pipeline is
    fitted on the module-level ``X_train`` / ``y_train`` split and scored on
    ``X_test`` / ``y_test``. The best setting (first one reaching the
    highest accuracy) is printed.

    NOTE: the best setting is picked on the same split that is used to
    report its accuracy, so the printed figure is an optimistic estimate.

    Parameters
    ----------
    n_features : iterable of int
        Candidate values for the vectorizer's ``max_features``.
    stop_words : str, list or None
        Forwarded to the vectorizer's ``stop_words`` parameter.
    ngram_range : tuple of (int, int)
        Forwarded to the vectorizer's ``ngram_range`` parameter.
    classifier : estimator
        Template for the classification step; it is cloned, not fitted.
    vectorizer : estimator
        Template for the vectorizing step; it is cloned, not modified.

    Returns
    -------
    list of (n, accuracy) tuples, one per entry of ``n_features``.
    """
    # Local import: keeps this block self-contained without touching the
    # notebook's import cell.
    from sklearn.base import clone

    result = []

    best_accuracy = 0
    best_feature_count = 0
    for n in n_features:
        # Work on fresh copies: calling set_params()/fit() on the arguments
        # themselves would leave the caller's estimators configured and
        # fitted for whatever setting happened to run last.
        step_vectorizer = clone(vectorizer).set_params(
            stop_words=stop_words, max_features=n, ngram_range=ngram_range
        )
        checker_pipeline = Pipeline([
            ('vectorizer', step_vectorizer),
            ('classifier', clone(classifier))
        ])

        nfeature_accuracy = accuracy_summary(checker_pipeline, X_train, y_train, X_test, y_test)

        # Strict ">" keeps the smallest feature count among ties.
        if nfeature_accuracy > best_accuracy:
            best_accuracy = nfeature_accuracy
            best_feature_count = n

        result.append((n, nfeature_accuracy))

    print ("Validation result for {} features has the highest accuracy, {}".format(best_feature_count, best_accuracy* 100))
    return result


# Unigram sweep with CountVectorizer + MultinomialNB.
# In the labels below "with stop words" means stop words are kept
# (stop_words=None) and "without stop words" means they are removed
# (stop_words='english').
cvec = CountVectorizer()
mnb = MultinomialNB()
# candidate max_features values: 1, 101, ..., 9901
n_features = np.arange(1,10000,100)


# unigram MultinomialNB accuracy for each candidate feature count
print ("RESULT FOR UNIGRAM WITH STOP WORDS\n")
feature_accuracy_sw = nfeature_accuracy_checker(vectorizer=cvec, classifier=mnb, ngram_range=(1,1), n_features = n_features, stop_words= None)

print("-------------------------------")
print("")
print ("RESULT FOR UNIGRAM WITHOUT STOP WORDS\n")
feature_accuracy_wo_sw = nfeature_accuracy_checker(vectorizer=cvec, classifier=mnb,stop_words='english',ngram_range=(1,1), n_features = n_features)


# DF of unigram model with stop words
feature_ug_sw_DF = pd.DataFrame(feature_accuracy_sw, columns = ['feature_count','model_accuracy'])

# DF of unigram model without stop words
feature_ug_wo_sw_DF = pd.DataFrame(feature_accuracy_wo_sw, columns = ['feature_count','model_accuracy'])

# accuracy against feature count, one line per stop-word setting
plt.figure(figsize=(12,12))
plt.plot(feature_ug_sw_DF.feature_count, feature_ug_sw_DF.model_accuracy, label='with stop words')
plt.plot(feature_ug_wo_sw_DF.feature_count, feature_ug_wo_sw_DF.model_accuracy,label='without stop words')
plt.title("Without stop words VS With stop words (Unigram): Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()

In [None]:
# Narrative output: findings from the unigram sweep. The figures are
# hard-coded text, not computed here.
print("The plot indicates that ")
print("          The unigram model accuracy is high with stop words")
print("          The accuracy is 75.26% stop words for 1101 features")
print("          The accuracy is 72.36%  without stop words for 3301 features")



In [None]:
# Now get the accuracy for bi-gram and tri-gram models with and without stop words.
# ngram_range=(1, 2) means unigrams + bigrams; (1, 3) adds trigrams as well.

# candidate max_features values: 1, 101, ..., 99901 (1000 settings per sweep,
# so each of the four sweeps below fits 1000 pipelines)
n_features = np.arange(1,100000,100)

print ("RESULT FOR BIGRAM WITH STOP WORDS")
accuracy_sw_bi_gram = nfeature_accuracy_checker(stop_words=None,vectorizer=cvec, classifier=mnb,n_features=n_features,ngram_range=(1, 2))
print("-------------------------------\n")

print ("RESULT FOR TRIGRAM WITH STOP WORDS")
accuracy_sw_tri_gram = nfeature_accuracy_checker(stop_words=None,vectorizer=cvec, classifier=mnb,n_features=n_features,ngram_range=(1, 3))
print("-------------------------------\n")

print ("RESULT FOR BIGRAM WITHOUT STOP WORDS")
accuracy_wo_sw_bi_gram = nfeature_accuracy_checker(stop_words='english',vectorizer=cvec, classifier=mnb,n_features=n_features,ngram_range=(1, 2))
print("-------------------------------\n")

print ("RESULT FOR TRIGRAM WITHOUT STOP WORDS")
accuracy_wo_sw_tri_gram = nfeature_accuracy_checker(stop_words='english',vectorizer=cvec, classifier=mnb,n_features=n_features,ngram_range=(1, 3))
print("-------------------------------\n")


In [None]:
# Plot the accuracies of the four CountVectorizer bi-gram / tri-gram sweeps.
# Each sweep result is a list of (feature_count, accuracy) tuples.

#DF of bigram model with stop words
sw_bi_gram_DF = pd.DataFrame(accuracy_sw_bi_gram, columns = ['feature_count','model_accuracy'])

#DF of trigram model with stop words
sw_tri_gram_DF = pd.DataFrame(accuracy_sw_tri_gram, columns = ['feature_count','model_accuracy'])

#DF of bigram model without stop words
wo_sw_bi_gram_DF = pd.DataFrame(accuracy_wo_sw_bi_gram, columns = ['feature_count','model_accuracy'])

#DF of trigram model without stop words
wo_sw_tri_gram_DF = pd.DataFrame(accuracy_wo_sw_tri_gram, columns = ['feature_count','model_accuracy'])

# one line per sweep on a shared figure
plt.figure(figsize=(12,12))
plt.plot(sw_bi_gram_DF.feature_count, sw_bi_gram_DF.model_accuracy, label='bi-gram with stop words')
plt.plot(sw_tri_gram_DF.feature_count, sw_tri_gram_DF.model_accuracy,label='tri-gram with stop words')
plt.plot(wo_sw_bi_gram_DF.feature_count, wo_sw_bi_gram_DF.model_accuracy,label='bi-gram without stop words')
plt.plot(wo_sw_tri_gram_DF.feature_count, wo_sw_tri_gram_DF.model_accuracy,label='tri-gram without stop words')


plt.title("Bi-gram VS Tri-gram: Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()

In [None]:
# Narrative output: findings from the bi-gram / tri-gram sweeps. The
# figures are hard-coded text, not computed here.
print("The plot indicates that tri-gram model with stopwords has the highest accuracy at 75.526% for 9901 features")
print("Though the bi-gram model with stopwords has the same accuracy, the number of features are 10701")      
print("Now, get the confusion matrix and classification report for bi-gram and tri-gram model for 10701 and 9901 features respectievely")


def get_classif_report(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline``, then print its test accuracy and per-class report.

    Parameters
    ----------
    pipeline : object exposing ``fit(x, y)`` (returning the fitted model)
        and ``predict(x)``
    x_train, y_train : training inputs and labels
    x_test, y_test : inputs and labels the fitted model is evaluated on

    Returns
    -------
    None; the accuracy and the classification report are printed.
    """
    predictions = pipeline.fit(x_train, y_train).predict(x_test)
    print("Accuracy = {accuracy_score}".format(
        accuracy_score=metrics.accuracy_score(y_test, predictions)))
    # Display names for the classes in the printed report.
    class_names = ['negative', 'positive']
    print (metrics.classification_report(y_test, predictions, target_names=class_names))
    
# Classification reports for the two best CountVectorizer settings quoted
# above. Neither vectorizer sets stop_words, so stop words are kept.
# Both pipelines reuse the `mnb` classifier object created in an earlier cell.
print("Bigram Classification Report")
bigram_CV = CountVectorizer(max_features=10701,ngram_range=(1, 2))
pipeline = Pipeline([
        ('vectorizer', bigram_CV),
        ('classifier', mnb)
    ])

get_classif_report(pipeline, X_train, y_train, X_test, y_test)

print("-------------------------\n")
print("Trigram Classification Report")
trigram_CV = CountVectorizer(max_features=9901,ngram_range=(1, 3))
pipeline = Pipeline([
        ('vectorizer', trigram_CV),
        ('classifier', mnb)
    ])

get_classif_report(pipeline, X_train, y_train, X_test, y_test)



In [None]:
# Narrative output: summary of the CountVectorizer experiments.
# The bigram model referred to here keeps stop words (its vectorizer is
# built without a stop_words setting), so the message says "with stop words"
# rather than the previous, contradictory "without stop words".
print("Using CountVectorizer, the tri-gram model with stop words and 9901 features has same accuracy as bigram model with stop words and 10701 features")
print("Both the models have f1 score of 70% for negative and 79% for positive cases")

print("Now ,try a TFIdf Vectorizer and compare the accuracy to see if the accuracy is better with or without stopwords")

In [None]:
# Unigram sweep with TfidfVectorizer + MultinomialNB.
tvec = TfidfVectorizer()
mnb = MultinomialNB()

# candidate max_features values: 1, 1001, ..., 9001
n_features = np.arange(1,10000,1000)

print("\n")
print ("RESULT FOR UNIGRAM WITH STOP WORDS (TFIDF)")
tfidf_ug_sw = nfeature_accuracy_checker(vectorizer=tvec, classifier=mnb, n_features=n_features, stop_words= None, ngram_range=(1,1))
print("-------------------------------\n")

print ("RESULT FOR UNIGRAM WITHOUT STOP WORDS (TFIDF) \n")
tfidf_ug_wo_sw = nfeature_accuracy_checker(vectorizer=tvec, classifier=mnb, n_features=n_features, stop_words= 'english', ngram_range=(1,1))
print("-------------------------------\n")

# DF of unigram model with stop words kept (stop_words=None)
tfidf_ug_sw_DF = pd.DataFrame(tfidf_ug_sw, columns = ['feature_count','model_accuracy'])

# DF of unigram model with English stop words removed (stop_words='english')
tfidf_ug_wo_sw_DF = pd.DataFrame(tfidf_ug_wo_sw, columns = ['feature_count','model_accuracy'])

# accuracy against feature count, one line per stop-word setting
plt.figure(figsize=(12,12))
plt.plot(tfidf_ug_sw_DF.feature_count, tfidf_ug_sw_DF.model_accuracy, label='with stop words')
plt.plot(tfidf_ug_wo_sw_DF.feature_count, tfidf_ug_wo_sw_DF.model_accuracy,label='without stop words')
plt.title("TFIDF Without stop words VS With stop words (Unigram): Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()


In [None]:
# Narrative output: findings from the TF-IDF unigram sweep (hard-coded text).
print( "With TfidfVectorizer, the unigram model accuracy is high without stop words (73.94%) with 3001 features")
print( "which is less than what we achieved (75%)with countVectorizer unigram")

print(" Now try bigram and trigram model with TFIDF")


# Now get the accuracy for bi-gram and tri-gram models with and without stop words

# candidate max_features values: 1, 101, ..., 99901 (1000 settings per sweep)
n_features = np.arange(1,100000,100)

print ("RESULT FOR BIGRAM WITH STOP WORDS")
tfidf_accuracy_sw_bi_gram = nfeature_accuracy_checker(stop_words=None,vectorizer=tvec, classifier=mnb,n_features=n_features,ngram_range=(1, 2))
print("-------------------------------\n")

print ("RESULT FOR TRIGRAM WITH STOP WORDS")
tfidf_accuracy_sw_tri_gram = nfeature_accuracy_checker(stop_words=None,vectorizer=tvec, classifier=mnb,n_features=n_features,ngram_range=(1, 3))
print("-------------------------------\n")

print ("RESULT FOR BIGRAM WITHOUT STOP WORDS")
tfidf_accuracy_wo_sw_bi_gram = nfeature_accuracy_checker(stop_words='english',vectorizer=tvec, classifier=mnb,n_features=n_features,ngram_range=(1, 2))
print("-------------------------------\n")

print ("RESULT FOR TRIGRAM WITHOUT STOP WORDS")
tfidf_accuracy_wo_sw_tri_gram = nfeature_accuracy_checker(stop_words='english',vectorizer=tvec, classifier=mnb,n_features=n_features,ngram_range=(1, 3))
print("-------------------------------\n")

#DF of bigram model with stop words
tfidf_sw_bi_gram_DF = pd.DataFrame(tfidf_accuracy_sw_bi_gram, columns = ['feature_count','model_accuracy'])

#DF of trigram model with stop words
tfidf_sw_tri_gram_DF = pd.DataFrame(tfidf_accuracy_sw_tri_gram, columns = ['feature_count','model_accuracy'])

#DF of bigram model without stop words
tfidf_wo_sw_bi_gram_DF = pd.DataFrame(tfidf_accuracy_wo_sw_bi_gram, columns = ['feature_count','model_accuracy'])

#DF of trigram model without stop words
tfidf_wo_sw_tri_gram_DF = pd.DataFrame(tfidf_accuracy_wo_sw_tri_gram, columns = ['feature_count','model_accuracy'])

# one line per sweep on a shared figure
plt.figure(figsize=(12,12))
plt.plot(tfidf_sw_bi_gram_DF.feature_count, tfidf_sw_bi_gram_DF.model_accuracy, label='bi-gram with stop words')
plt.plot(tfidf_sw_tri_gram_DF.feature_count, tfidf_sw_tri_gram_DF.model_accuracy,label='tri-gram with stop words')
plt.plot(tfidf_wo_sw_bi_gram_DF.feature_count, tfidf_wo_sw_bi_gram_DF.model_accuracy,label='bi-gram without stop words')
plt.plot(tfidf_wo_sw_tri_gram_DF.feature_count, tfidf_wo_sw_tri_gram_DF.model_accuracy,label='tri-gram without stop words')


plt.title("TFIDF Bi-gram VS Tri-gram: Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()


In [None]:
# Narrative output: findings from the TF-IDF bi-gram / tri-gram sweeps
# (hard-coded text, not computed here).
# NOTE(review): the last two messages both say "trigram model without stop
# words"; one of them presumably refers to the bigram model - confirm
# against the sweep output before relying on these figures.
print("With TFIDF, a MultinomialNB bigram model with stop words gave accuracy of 76.05 with 2301 features")
print("With TFIDF, a MultinomialNB trigram model with stop words gave accuracy of 76.578 with 1901 features")
print("With TFIDF, a MultinomialNB trigram model without stop words gave accuracy of 71.842 with 3001 features")
print("With TFIDF, a MultinomialNB trigram model without stop words gave accuracy of 71.842 with 2901 features")

print("Try a LogisticRegression model with TFIDF to see if the accuracy improves")

# Unigram sweep with TfidfVectorizer + LogisticRegression (default settings).
tvec = TfidfVectorizer()
lr = LogisticRegression()

# candidate max_features values: 1, 101, ..., 9901
n_features = np.arange(1,10000,100)

print("RESULT FOR UNIGRAM WITH STOP WORDS (TFIDF)")
tfidf_ug_lr_sw = nfeature_accuracy_checker(classifier=lr,vectorizer=tvec,stop_words=None,n_features=n_features,ngram_range=(1, 1))
print("-------------------------------\n")

print("RESULT FOR UNIGRAM WITHOUT STOP WORDS (TFIDF)")
tfidf_ug_lr_wo_sw = nfeature_accuracy_checker(classifier=lr,vectorizer=tvec,stop_words='english',n_features=n_features,ngram_range=(1, 1))
print("-------------------------------\n")

# DF of unigram model with stop words kept (stop_words=None)
tfidf_ug_lr_sw_DF = pd.DataFrame(tfidf_ug_lr_sw, columns = ['feature_count','model_accuracy'])

# DF of unigram model with English stop words removed (stop_words='english')
tfidf_ug_lr_wo_sw_DF = pd.DataFrame(tfidf_ug_lr_wo_sw, columns = ['feature_count','model_accuracy'])

# accuracy against feature count, one line per stop-word setting
plt.figure(figsize=(12,12))
plt.plot(tfidf_ug_lr_sw_DF.feature_count, tfidf_ug_lr_sw_DF.model_accuracy, label='with stop words')
plt.plot(tfidf_ug_lr_wo_sw_DF.feature_count, tfidf_ug_lr_wo_sw_DF.model_accuracy,label='without stop words')
plt.title("LogisticRegression Model Without stop words VS With stop words (Unigram): Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()





In [None]:
# Now try bi-gram and tri-gram LogisticRegression models to see if they give
# a better accuracy. "WITH STOP WORDS" = stop words kept (stop_words=None),
# "WITHOUT STOP WORDS" = English stop words removed (stop_words='english').

# candidate max_features values: 1, 10001, ..., 90001
n_features = np.arange(1,100000,10000)


print ("RESULT FOR LOGISTIC REGRESSION BIGRAM WITH STOP WORDS\n")
tfidf_lr_bigram_sw = nfeature_accuracy_checker(classifier=lr,vectorizer=tvec,stop_words=None,n_features=n_features,ngram_range=(1, 2))
print("-------------------------------\n")

print ("RESULT FOR LOGISTIC REGRESSION BIGRAM WITHOUT STOP WORDS\n")
tfidf_lr_bigram_wo_sw = nfeature_accuracy_checker(classifier=lr,vectorizer=tvec,stop_words='english',n_features=n_features,ngram_range=(1, 2))
print("-------------------------------\n")


print ("RESULT FOR LOGISTIC REGRESSION TRIGRAM WITH STOP WORDS\n")
tfidf_lr_trigram_sw = nfeature_accuracy_checker(classifier=lr,vectorizer=tvec,stop_words=None,n_features=n_features,ngram_range=(1, 3))
print("-------------------------------\n")

# This run removes stop words; the banner previously repeated
# "WITH STOP WORDS", which mislabelled its result in the output.
print ("RESULT FOR LOGISTIC REGRESSION TRIGRAM WITHOUT STOP WORDS\n")
tfidf_lr_trigram_wo_sw = nfeature_accuracy_checker(classifier=lr,vectorizer=tvec,stop_words='english',n_features=n_features,ngram_range=(1, 3))
print("-------------------------------\n")

# Collect the four LogisticRegression sweep results into DataFrames and plot
# accuracy against feature count.

#DF of bigram model with stop words
tfidf_lr_bigram_sw_DF = pd.DataFrame(tfidf_lr_bigram_sw, columns = ['feature_count','model_accuracy'])

#DF of bigram model without stop words
tfidf_lr_bigram_wo_sw_DF = pd.DataFrame(tfidf_lr_bigram_wo_sw, columns = ['feature_count','model_accuracy'])

#DF of trigram model with stop words
tfidf_lr_trigram_sw_DF = pd.DataFrame(tfidf_lr_trigram_sw, columns = ['feature_count','model_accuracy'])

#DF of trigram model without stop words
tfidf_lr_trigram_wo_sw_DF = pd.DataFrame(tfidf_lr_trigram_wo_sw, columns = ['feature_count','model_accuracy'])

# one line per sweep on a shared figure
plt.figure(figsize=(12,12))
plt.plot(tfidf_lr_bigram_sw_DF.feature_count, tfidf_lr_bigram_sw_DF.model_accuracy, label='bi-gram with stop words')
plt.plot(tfidf_lr_bigram_wo_sw_DF.feature_count, tfidf_lr_bigram_wo_sw_DF.model_accuracy,label='bi-gram without stop words')
plt.plot(tfidf_lr_trigram_sw_DF.feature_count, tfidf_lr_trigram_sw_DF.model_accuracy, label='tri-gram with stop words')
plt.plot(tfidf_lr_trigram_wo_sw_DF.feature_count, tfidf_lr_trigram_wo_sw_DF.model_accuracy,label='tri-gram without stop words')

plt.title("LogisticRegression Model Bigram and Trigram: Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")
plt.legend()

In [None]:
# Narrative output: findings from the LogisticRegression sweeps (hard-coded
# text, not computed here).
# NOTE(review): the first message quotes 76.84% for the LR unigram model,
# which is higher than the 76.578% quoted for MultinomialNB; the "less than"
# statement presumably refers to the LR trigram figure only - confirm.
print(" The accuracy of LR unigram is 76.84% with 801 features")
print(" The accuracy of LR trigram is 74.47% with 10001 features")
print(" The accuracy is less than that of MultinomialNB trigam, which is 76.578 with 1901 features")

print(" Lets use MultinomialNB trigram model with max features of 1901 and use Chi2 to pick the most relevant features")

In [None]:
# Rank the (at most) 1901 TF-IDF uni/bi/tri-gram features by their chi2
# score against the label.
tfidf = TfidfVectorizer(max_features = 1901, ngram_range = (1,3))

x_train_tfidf = tfidf.fit_transform(X_train)
x_test_tfidf = tfidf.transform(X_test)

# chi2() returns (scores, p-values); only the scores are used here
chisq = chi2(x_train_tfidf, y_train)[0]

# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); older ones (such as the 0.22.1 listed in the
# overview) only have the latter. Use whichever this installation offers.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

# to see the top features based on chi2 values, combine feature names and
# associated chi2 values in a data frame and sort on chi2 in descending order.
feature_DF = pd.DataFrame({"Feature_Name": feature_names, "Chisq_Value" : chisq.tolist()})


feature_DF.sort_values( by= "Chisq_Value", ascending= False)


In [None]:
# Sweep k for SelectKBest(chi2): for every k, keep the k highest-scoring
# TF-IDF features, fit a fresh MultinomialNB and score it on the test split.
# NOTE: k is chosen on the same split that is used to report accuracy, so
# the resulting figure is an optimistic estimate.

high_score = 0
feature_count = 0

# max_features=1901 is only an upper limit on the vocabulary size. Cap k at
# the number of columns actually present so SelectKBest is never asked for
# more features than exist (which it rejects or warns about, depending on
# the scikit-learn version). With 1900 or more columns the sweep is the
# same 1..1900 range as before.
max_k = min(1900, x_train_tfidf.shape[1])

ch2_result = []
for n in np.arange(1, max_k + 1, 1):
    ch2 = SelectKBest(chi2, k=n)
    x_train_k_best = ch2.fit_transform(x_train_tfidf, y_train)
    x_test_k_best = ch2.transform(x_test_tfidf)
    clf = MultinomialNB()
    clf.fit(x_train_k_best, y_train)
    score = clf.score(x_test_k_best, y_test)
    # strict ">" keeps the smallest k among ties
    if score > high_score:
        high_score = score
        feature_count = n
    ch2_result.append(score)

print("The high score is {} with feature count {}".format(high_score* 100,feature_count))



In [None]:
#The final Model

# k=1707 is the feature count the overview reports as best for the chi2
# sweep; it is hard-coded here rather than taken from the sweep result.
ch2 = SelectKBest(chi2, k=1707)
x_train_k_best = ch2.fit_transform(x_train_tfidf, y_train)
x_test_k_best = ch2.transform(x_test_tfidf)
clf = MultinomialNB()
clf.fit(x_train_k_best, y_train)
score = clf.score(x_test_k_best, y_test)
print("Accuracy score of the final model is {}".format(score*100))


y_pred = clf.predict(x_test_k_best)
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy = {accuracy_score}".format(accuracy_score=accuracy*100))
# Was `nt (...)`: a truncated `print` that raised NameError before the
# classification report could be shown.
print (metrics.classification_report(y_test, y_pred, target_names=['negative','positive']))

print("\n The f1-score is 73% for negative and 79% for positive")


In [None]:
# Read the evaluation set. The path is relative to the current working
# directory, not to a "data" subfolder. FILE_NAME is reused here and no
# longer points at the training file afterwards.

FILE_NAME = "./sentiment-eval.csv"
eval_data = pd.read_csv(FILE_NAME, header=0)

# apply the same cleaning that was applied to the training text
eval_data['text'] = eval_data['text'].apply(clean_text)

# transform with the already-fitted TF-IDF vectorizer and feature selector
# (transform only, no re-fitting on the evaluation data)
x_eval_tfidf = tfidf.transform(eval_data['text'])
x_eval_k_best = ch2.transform(x_eval_tfidf)

# predict with the final classifier
predictions = clf.predict(x_eval_k_best)


pred_df = pd.DataFrame(predictions, columns=['result'])

# attach the predicted labels as a new 'label' column
eval_data['label'] = pred_df['result'].values

# drop the text column so it is not part of the exported frame
eval_data = eval_data.drop('text',axis = 1)


In [None]:
# Export the predictions to predictions.csv in the current folder, without
# writing the DataFrame index as a column.

eval_data.to_csv('./predictions.csv', index=False)