<a href="https://colab.research.google.com/github/atharvajpatel/AI-Hackathon-Ignition/blob/master/Final_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Importing NumPy library.
import numpy as np

#Importing Pandas library to manage data frames.
import pandas as pd

#Importing TfidfVectorizer from Sci-Kit Learn to turn the cleaned text into TF-IDF feature vectors.
from sklearn.feature_extraction.text import TfidfVectorizer

#Importing Logistic Regression from Sci-Kit Learn as our sentiment analysis classifier.
from sklearn.linear_model import LogisticRegression

#Importing f1_score from Sci-Kit Learn to report a per-class F1 score (it combines precision and recall) next to the accuracy that .score returns.
from sklearn.metrics import f1_score

#Importing train_test_split from Sci-Kit Learn to split our data set into training data and testing data.
from sklearn.model_selection import train_test_split

#Importing GridSearchCV from Sci-Kit Learn to try every combination of the predefined hyperparameters with cross validation.
from sklearn.model_selection import GridSearchCV

#Importing files from google.colab to upload csv files from the local machine (only available inside Google Colab).
from google.colab import files

#Importing io to wrap the uploaded bytes in a file-like object that pandas can read.
import io

#Importing nltk and downloading its stop word corpus so that stopwords.words can be read below.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

#Making a list of English stop words using the stop words listed in nltk.
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
#Review pairs one piece of text with its numeric score and exposes the score as a sentiment label.
class Review:
  """A single labelled review: the text plus the numeric score attached to it."""

  def __init__(self, text, score):
    """Store the review text and its score unchanged."""
    self.score = score
    self.text = text

  def getSentiment(self):
    """Return "Positive" when the score equals 1, otherwise "Negative"."""
    return "Positive" if self.score == 1 else "Negative"

In [None]:
#Ask the user to upload the training csv through the Colab file picker.
#files.upload() returns a dictionary that maps each uploaded file name to its raw bytes.
data = files.upload()

#Wrap the uploaded bytes in an in-memory buffer and parse them into the training data frame.
training_csv_bytes = data['training_data.csv']
df = pd.read_csv(io.BytesIO(training_csv_bytes))


Saving training_data.csv to training_data.csv


In [None]:
#Data Cleaning

#Changing all letters to lowercase so that the stop word comparison below is case-insensitive.
#Splitting and re-joining on whitespace also collapses runs of whitespace into single spaces.
df['Clean Data'] = df['Text'].apply(lambda x: " ".join(y.lower() for y in x.split()))

#Removing the @mentions because they are unnecessary for sentiment analysis.
#regex=True is passed explicitly: from pandas 2.0 the default for Series.str.replace is regex=False,
#which would treat the pattern as literal text and silently remove nothing.
#The pattern is a raw string so that \w reaches the regex engine instead of being parsed as a string escape.
df['Clean Data'] = df['Clean Data'].str.replace(r'@\w+', '', regex=True)

#Removing punctuation (anything that is not a word character or whitespace).
df['Clean Data'] = df['Clean Data'].str.replace(r'[^\w\s]', '', regex=True)

#Removing stopwords from data set. A set gives constant-time membership checks instead of scanning
#the whole stop word list for every token.
#NOTE(review): punctuation is stripped before this step, so stop words that contain an apostrophe
#presumably no longer match their cleaned form (e.g. "dont"); confirm this ordering is intended.
stop_word_set = set(stop_words)
df['Clean Data'] = df['Clean Data'].apply(lambda x: ' '.join(y for y in x.split() if y not in stop_word_set))

#Looking at the top 30 most common words.
pd.Series(' '.join(df['Clean Data']).split()).value_counts().head(30)


im        111484
good       55981
day        51443
get        51097
like       48574
go         45500
dont       42064
today      40277
going      40100
love       39369
cant       39353
work       39282
got        37948
back       35082
time       35003
lol        34429
one        32897
u          32839
know       31801
really     30999
see        28795
well       27743
still      26724
want       26319
new        26312
think      25667
night      25543
amp        25289
thanks     24522
home       24506
dtype: int64

In [None]:
#Creating list of extra stopwords, picked from the list of common words shown by the previous cell.
#Each word is listed once ('got' used to appear twice, which had no effect).
new_stopwords = ['im', 'got', 'get', 'dont', 'going', 'cant', 'today']

#The words in the list are then removed from the data set. The set copy makes each membership
#check constant-time instead of a scan over the list for every token.
new_stopword_set = set(new_stopwords)
df['Clean Data'] = df['Clean Data'].apply(lambda x: " ".join(y for y in x.split() if y not in new_stopword_set))

In [None]:
#Build a flat list holding one Review object per row of the data frame.
#The columns are read by position: column 4 supplies the review text and column 3 supplies the score.
#NOTE(review): column 4 is presumably the 'Clean Data' column added during cleaning and column 3 the
#sentiment score; confirm against the csv layout.
#Walking the two columns directly is purely positional, so it stays correct even if the data frame
#index is not the default 0..n-1 range (iterrows() yields index labels, which .iloc would misread as
#positions), and it avoids one .iloc lookup per cell.
review = [
  Review(text, score)
  for text, score in zip(df.iloc[:, 4], df.iloc[:, 3])
]

In [None]:
#Use train_test_split to separate the Review objects into a training list and a testing list.
#test_size=0.25 holds out a quarter of the reviews for testing; random_state=1 fixes the shuffle so the split is the same on every run.
train, test = train_test_split(review, test_size=0.25,random_state=1)

In [None]:
#Create a TF-IDF vectorizer (default settings) that will convert the review text into numeric feature vectors.

tfv=TfidfVectorizer()

In [None]:
#Separate every Review object into its input (the text) and its target (the sentiment label).

#Training inputs and labels, collected in the same order as the train list.
trainX, trainY = [], []
for item in train:
  trainX.append(item.text)
  trainY.append(item.getSentiment())

#Testing inputs and labels, collected the same way.
testX, testY = [], []
for item in test:
  testX.append(item.text)
  testY.append(item.getSentiment())

#fit_transform learns the vocabulary and weights from the training text and returns its sparse TF-IDF matrix.
trainX_vector = tfv.fit_transform(trainX)

#The test text is only transformed, never fitted, so it is encoded with what was learned from the training text.
testX_vector = tfv.transform(testX)

In [None]:
#Hyperparameter search space that the grid search uses to tune the logistic regression model.
#
#penalty:
#  The type of regularization, which limits overfitting so the model predicts unseen data better.
#  L1 can shrink the coefficients of less important features all the way to zero, effectively
#  removing them, which helps when there is a very large number of features. L2 shrinks
#  coefficients without zeroing them out.
#
#C:
#  The inverse of the regularization strength: the lower the C value, the stronger the
#  regularization. A wide range of values is searched, from very strong regularization (1e-6)
#  up to weak regularization (2).
#
#solver:
#  The optimization algorithm used to minimize the loss function. 'lbfgs' (the scikit-learn
#  default), 'newton-cg' and 'liblinear' are tried.
#
#The grid is a list of two dictionaries so that only valid penalty/solver pairs are attempted:
#'lbfgs' and 'newton-cg' support only the 'l2' (or no) penalty, while 'liblinear' supports both
#'l1' and 'l2'. Crossing every penalty with every solver makes each l1 + lbfgs/newton-cg fit
#raise a ValueError and score nan, which wastes fits and floods the output with errors.
c_values = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 0.5, 1, 2]

param_grid = [
    {
        'penalty': ['l1'],
        'C': c_values,
        'solver': ['liblinear'],
    },
    {
        'penalty': ['l2'],
        'C': c_values,
        'solver': ['lbfgs', 'newton-cg', 'liblinear'],
    },
]

#GridSearchCV fits a logistic regression for every combination in param_grid and keeps the best one.
#cv=4 scores each combination with 4-fold cross validation on the training data.
#verbose=5 prints the progress and score of every individual fit.
#max_iter is set to the number of training rows plus one, a deliberately large cap on solver iterations.
grid = GridSearchCV(LogisticRegression(max_iter= len(trainX)+1), param_grid, cv=4, verbose = 5)

#The fit method runs the search and then trains the final model on the training vectors and labels.
grid.fit(trainX_vector, trainY)


Fitting 4 folds for each of 54 candidates, totalling 216 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s


[CV] C=1e-06, penalty=l1, solver=lbfgs ...............................
[CV] ..... C=1e-06, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1e-06, penalty=l1, solver=lbfgs ...............................
[CV] ..... C=1e-06, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1e-06, penalty=l1, solver=lbfgs ...............................
[CV] ..... C=1e-06, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1e-06, penalty=l1, solver=lbfgs ...............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.4s remaining:    0.0s


[CV] ..... C=1e-06, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1e-06, penalty=l1, solver=newton-cg ...........................
[CV] . C=1e-06, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1e-06, penalty=l1, solver=newton-cg ...........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.6s remaining:    0.0s
ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] . C=1e-06, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1e-06, penalty=l1, solver=newton-cg ...........................
[CV] . C=1e-06, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1e-06, penalty=l1, solver=newton-cg ...........................
[CV] . C=1e-06, penalty=l1, solver=newton-cg, score=nan, total=   0.2s
[CV] C=1e-06, penalty=l1, solver=liblinear ...........................
[CV]  C=1e-06, penalty=l1, solver=liblinear, score=0.500, total=   1.2s
[CV] C=1e-06, penalty=l1, solver=liblinear ...........................
[CV]  C=1e-06, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=1e-06, penalty=l1, solver=liblinear ...........................
[CV]  C=1e-06, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=1e-06, penalty=l1, solver=liblinear ...........................
[CV]  C=1e-06, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=1e-06, penalty=l2, solver=lbfgs ...............................
[C



[CV]  C=1e-06, penalty=l2, solver=newton-cg, score=0.525, total=  16.5s
[CV] C=1e-06, penalty=l2, solver=newton-cg ...........................
[CV]  C=1e-06, penalty=l2, solver=newton-cg, score=0.525, total=   2.4s
[CV] C=1e-06, penalty=l2, solver=newton-cg ...........................
[CV]  C=1e-06, penalty=l2, solver=newton-cg, score=0.526, total=   2.4s
[CV] C=1e-06, penalty=l2, solver=newton-cg ...........................
[CV]  C=1e-06, penalty=l2, solver=newton-cg, score=0.526, total=   2.5s
[CV] C=1e-06, penalty=l2, solver=liblinear ...........................
[CV]  C=1e-06, penalty=l2, solver=liblinear, score=0.701, total=   1.4s
[CV] C=1e-06, penalty=l2, solver=liblinear ...........................
[CV]  C=1e-06, penalty=l2, solver=liblinear, score=0.701, total=   1.4s
[CV] C=1e-06, penalty=l2, solver=liblinear ...........................
[CV]  C=1e-06, penalty=l2, solver=liblinear, score=0.700, total=   1.4s
[CV] C=1e-06, penalty=l2, solver=liblinear ...........................

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ..... C=1e-05, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1e-05, penalty=l1, solver=lbfgs ...............................
[CV] ..... C=1e-05, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1e-05, penalty=l1, solver=lbfgs ...............................
[CV] ..... C=1e-05, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1e-05, penalty=l1, solver=newton-cg ...........................
[CV] . C=1e-05, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1e-05, penalty=l1, solver=newton-cg ...........................


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] . C=1e-05, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1e-05, penalty=l1, solver=newton-cg ...........................
[CV] . C=1e-05, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1e-05, penalty=l1, solver=newton-cg ...........................
[CV] . C=1e-05, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1e-05, penalty=l1, solver=liblinear ...........................
[CV]  C=1e-05, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=1e-05, penalty=l1, solver=liblinear ...........................
[CV]  C=1e-05, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=1e-05, penalty=l1, solver=liblinear ...........................
[CV]  C=1e-05, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=1e-05, penalty=l1, solver=liblinear ...........................
[CV]  C=1e-05, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=1e-05, penalty=l2, solver=lbfgs ...............................
[C



[CV]  C=1e-05, penalty=l2, solver=newton-cg, score=0.703, total=  13.0s
[CV] C=1e-05, penalty=l2, solver=newton-cg ...........................
[CV]  C=1e-05, penalty=l2, solver=newton-cg, score=0.703, total=   4.9s
[CV] C=1e-05, penalty=l2, solver=newton-cg ...........................




[CV]  C=1e-05, penalty=l2, solver=newton-cg, score=0.702, total=   5.2s
[CV] C=1e-05, penalty=l2, solver=newton-cg ...........................




[CV]  C=1e-05, penalty=l2, solver=newton-cg, score=0.703, total=  14.8s
[CV] C=1e-05, penalty=l2, solver=liblinear ...........................
[CV]  C=1e-05, penalty=l2, solver=liblinear, score=0.717, total=   1.7s
[CV] C=1e-05, penalty=l2, solver=liblinear ...........................
[CV]  C=1e-05, penalty=l2, solver=liblinear, score=0.718, total=   1.7s
[CV] C=1e-05, penalty=l2, solver=liblinear ...........................
[CV]  C=1e-05, penalty=l2, solver=liblinear, score=0.716, total=   1.7s
[CV] C=1e-05, penalty=l2, solver=liblinear ...........................
[CV]  C=1e-05, penalty=l2, solver=liblinear, score=0.717, total=   1.7s
[CV] C=0.0001, penalty=l1, solver=lbfgs ..............................
[CV] .... C=0.0001, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.0001, penalty=l1, solver=lbfgs ..............................


ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] .... C=0.0001, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.0001, penalty=l1, solver=lbfgs ..............................
[CV] .... C=0.0001, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.0001, penalty=l1, solver=lbfgs ..............................
[CV] .... C=0.0001, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.0001, penalty=l1, solver=newton-cg ..........................
[CV]  C=0.0001, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.0001, penalty=l1, solver=newton-cg ..........................


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV]  C=0.0001, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.0001, penalty=l1, solver=newton-cg ..........................
[CV]  C=0.0001, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.0001, penalty=l1, solver=newton-cg ..........................
[CV]  C=0.0001, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................
[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................
[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................
[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=0.0001, penalty=l1, solver=liblinear ..........................
[CV]  C=0.0001, penalty=l1, solver=liblinear, score=0.500, total=   1.1s
[CV] C=0.0001, penalty=l2, solver=lbfgs .............................



[CV]  C=0.0001, penalty=l2, solver=newton-cg, score=0.726, total=   3.8s
[CV] C=0.0001, penalty=l2, solver=newton-cg ..........................
[CV]  C=0.0001, penalty=l2, solver=newton-cg, score=0.727, total=   3.5s
[CV] C=0.0001, penalty=l2, solver=newton-cg ..........................
[CV]  C=0.0001, penalty=l2, solver=newton-cg, score=0.726, total=   3.4s
[CV] C=0.0001, penalty=l2, solver=newton-cg ..........................
[CV]  C=0.0001, penalty=l2, solver=newton-cg, score=0.727, total=   3.4s
[CV] C=0.0001, penalty=l2, solver=liblinear ..........................
[CV]  C=0.0001, penalty=l2, solver=liblinear, score=0.726, total=   1.9s
[CV] C=0.0001, penalty=l2, solver=liblinear ..........................
[CV]  C=0.0001, penalty=l2, solver=liblinear, score=0.728, total=   1.9s
[CV] C=0.0001, penalty=l2, solver=liblinear ..........................
[CV]  C=0.0001, penalty=l2, solver=liblinear, score=0.726, total=   1.9s
[CV] C=0.0001, penalty=l2, solver=liblinear ...................

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ..... C=0.001, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.001, penalty=l1, solver=lbfgs ...............................
[CV] ..... C=0.001, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.001, penalty=l1, solver=lbfgs ...............................
[CV] ..... C=0.001, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.001, penalty=l1, solver=newton-cg ...........................
[CV] . C=0.001, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.001, penalty=l1, solver=newton-cg ...........................


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] . C=0.001, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.001, penalty=l1, solver=newton-cg ...........................
[CV] . C=0.001, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.001, penalty=l1, solver=newton-cg ...........................
[CV] . C=0.001, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.001, penalty=l1, solver=liblinear ...........................
[CV]  C=0.001, penalty=l1, solver=liblinear, score=0.559, total=   1.6s
[CV] C=0.001, penalty=l1, solver=liblinear ...........................
[CV]  C=0.001, penalty=l1, solver=liblinear, score=0.560, total=   2.0s
[CV] C=0.001, penalty=l1, solver=liblinear ...........................
[CV]  C=0.001, penalty=l1, solver=liblinear, score=0.560, total=   2.2s
[CV] C=0.001, penalty=l1, solver=liblinear ...........................
[CV]  C=0.001, penalty=l1, solver=liblinear, score=0.559, total=   1.8s
[CV] C=0.001, penalty=l2, solver=lbfgs ...............................
[C

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ...... C=0.01, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.01, penalty=l1, solver=lbfgs ................................
[CV] ...... C=0.01, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.01, penalty=l1, solver=lbfgs ................................
[CV] ...... C=0.01, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.01, penalty=l1, solver=newton-cg ............................
[CV] .. C=0.01, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.01, penalty=l1, solver=newton-cg ............................


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] .. C=0.01, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.01, penalty=l1, solver=newton-cg ............................
[CV] .. C=0.01, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.01, penalty=l1, solver=newton-cg ............................
[CV] .. C=0.01, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.01, penalty=l1, solver=liblinear ............................
[CV]  C=0.01, penalty=l1, solver=liblinear, score=0.707, total=   2.6s
[CV] C=0.01, penalty=l1, solver=liblinear ............................
[CV]  C=0.01, penalty=l1, solver=liblinear, score=0.710, total=   2.5s
[CV] C=0.01, penalty=l1, solver=liblinear ............................
[CV]  C=0.01, penalty=l1, solver=liblinear, score=0.707, total=   2.7s
[CV] C=0.01, penalty=l1, solver=liblinear ............................
[CV]  C=0.01, penalty=l1, solver=liblinear, score=0.707, total=   2.9s
[CV] C=0.01, penalty=l2, solver=lbfgs ................................
[CV] .

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ....... C=0.1, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.1, penalty=l1, solver=lbfgs .................................
[CV] ....... C=0.1, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.1, penalty=l1, solver=lbfgs .................................
[CV] ....... C=0.1, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.1, penalty=l1, solver=newton-cg .............................
[CV] ... C=0.1, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.1, penalty=l1, solver=newton-cg .............................


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ... C=0.1, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.1, penalty=l1, solver=newton-cg .............................
[CV] ... C=0.1, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.1, penalty=l1, solver=newton-cg .............................
[CV] ... C=0.1, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[CV] . C=0.1, penalty=l1, solver=liblinear, score=0.760, total=   3.6s
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[CV] . C=0.1, penalty=l1, solver=liblinear, score=0.762, total=   3.6s
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[CV] . C=0.1, penalty=l1, solver=liblinear, score=0.760, total=   3.7s
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[CV] . C=0.1, penalty=l1, solver=liblinear, score=0.761, total=   3.6s
[CV] C=0.1, penalty=l2, solver=lbfgs .................................
[CV] .

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ....... C=0.5, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.5, penalty=l1, solver=lbfgs .................................
[CV] ....... C=0.5, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.5, penalty=l1, solver=lbfgs .................................
[CV] ....... C=0.5, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=0.5, penalty=l1, solver=newton-cg .............................
[CV] ... C=0.5, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.5, penalty=l1, solver=newton-cg .............................


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ... C=0.5, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.5, penalty=l1, solver=newton-cg .............................
[CV] ... C=0.5, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.5, penalty=l1, solver=newton-cg .............................
[CV] ... C=0.5, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=0.5, penalty=l1, solver=liblinear .............................
[CV] . C=0.5, penalty=l1, solver=liblinear, score=0.773, total=   4.2s
[CV] C=0.5, penalty=l1, solver=liblinear .............................
[CV] . C=0.5, penalty=l1, solver=liblinear, score=0.775, total=   4.2s
[CV] C=0.5, penalty=l1, solver=liblinear .............................
[CV] . C=0.5, penalty=l1, solver=liblinear, score=0.774, total=   4.2s
[CV] C=0.5, penalty=l1, solver=liblinear .............................
[CV] . C=0.5, penalty=l1, solver=liblinear, score=0.775, total=   4.1s
[CV] C=0.5, penalty=l2, solver=lbfgs .................................
[CV] .

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ......... C=1, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1, penalty=l1, solver=lbfgs ...................................
[CV] ......... C=1, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1, penalty=l1, solver=lbfgs ...................................
[CV] ......... C=1, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=1, penalty=l1, solver=newton-cg ...............................
[CV] ..... C=1, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1, penalty=l1, solver=newton-cg ...............................


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ..... C=1, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1, penalty=l1, solver=newton-cg ...............................
[CV] ..... C=1, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1, penalty=l1, solver=newton-cg ...............................
[CV] ..... C=1, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=1, penalty=l1, solver=liblinear ...............................
[CV] ... C=1, penalty=l1, solver=liblinear, score=0.775, total=   4.6s
[CV] C=1, penalty=l1, solver=liblinear ...............................
[CV] ... C=1, penalty=l1, solver=liblinear, score=0.778, total=   4.7s
[CV] C=1, penalty=l1, solver=liblinear ...............................
[CV] ... C=1, penalty=l1, solver=liblinear, score=0.775, total=   4.7s
[CV] C=1, penalty=l1, solver=liblinear ...............................
[CV] ... C=1, penalty=l1, solver=liblinear, score=0.777, total=   4.7s
[CV] C=1, penalty=l2, solver=lbfgs ...................................
[CV] .

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ......... C=2, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=2, penalty=l1, solver=lbfgs ...................................
[CV] ......... C=2, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=2, penalty=l1, solver=lbfgs ...................................
[CV] ......... C=2, penalty=l1, solver=lbfgs, score=nan, total=   0.1s
[CV] C=2, penalty=l1, solver=newton-cg ...............................
[CV] ..... C=2, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=2, penalty=l1, solver=newton-cg ...............................


ValueError: Solver newton-cg supports only 'l2' or 'none' penalties, got l1 penalty.



[CV] ..... C=2, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=2, penalty=l1, solver=newton-cg ...............................
[CV] ..... C=2, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=2, penalty=l1, solver=newton-cg ...............................
[CV] ..... C=2, penalty=l1, solver=newton-cg, score=nan, total=   0.1s
[CV] C=2, penalty=l1, solver=liblinear ...............................
[CV] ... C=2, penalty=l1, solver=liblinear, score=0.774, total=   6.0s
[CV] C=2, penalty=l1, solver=liblinear ...............................
[CV] ... C=2, penalty=l1, solver=liblinear, score=0.777, total=   5.5s
[CV] C=2, penalty=l1, solver=liblinear ...............................
[CV] ... C=2, penalty=l1, solver=liblinear, score=0.774, total=   5.6s
[CV] C=2, penalty=l1, solver=liblinear ...............................
[CV] ... C=2, penalty=l1, solver=liblinear, score=0.776, total=   5.6s
[CV] C=2, penalty=l2, solver=lbfgs ...................................
[CV] .

[Parallel(n_jobs=1)]: Done 216 out of 216 | elapsed: 24.2min finished


GridSearchCV(cv=4, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=750001, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 0.5, 1,
                               2],
                         'penalty': ['l1', 'l2'],
                         'solver': ['lbfgs', 'newton-cg', 'liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=5

In [None]:
#Evaluating the tuned model on the held-out test set.

#Score of the best model found by the grid search on the test vectors and their true labels.
test_score = grid.score(testX_vector, testY)
print(test_score)

#F1 score of the test predictions; average=None reports one score per sentiment label instead of a single average.
test_predictions = grid.predict(testX_vector)
print(f1_score(testY, test_predictions, average = None))

#Predicted sentiment label for the first review of the test set.
first_test_vector = testX_vector[0]
print(grid.predict(first_test_vector))

In [None]:
#Testing out our model 

#Uploading judgement data into a new dataframe
#files.upload() returns a dictionary that maps each uploaded file name to its raw bytes.
uploaded = files.upload()

#Wrap the uploaded bytes in an in-memory buffer and parse them into the judgement data frame.
judgement_csv_bytes = uploaded['contestant_judgment.csv']
df2 = pd.read_csv(io.BytesIO(judgement_csv_bytes))




Saving contestant_judgment.csv to contestant_judgment (1).csv


In [None]:
#Function to vectorize and predict a single piece of text
def final(list1):
  """Predict the sentiment of one text.

  list1 is a list holding the text to classify. The text is vectorized with the fitted
  TF-IDF vectorizer and classified with the tuned model. Returns 1 when the predicted label
  is "Positive" and 0 otherwise. Only the first prediction is inspected, so the comparison
  is always made on a single label rather than on a whole array.
  """
  new_test = tfv.transform(list1)
  predicted_labels = grid.predict(new_test)
  return 1 if predicted_labels[0] == "Positive" else 0

#Predict the sentiment of every judgement text in one batch.
#The text is read from column 2 by position, so the result does not depend on the data frame index
#(iterrows() yields index labels, which .iloc would misread as positions).
#NOTE(review): column 2 is presumably the text column of the judgement csv; confirm against the file layout.
#Vectorizing and predicting all rows at once replaces one transform/predict call per row.
judgement_texts = df2.iloc[:, 2]
if len(judgement_texts) > 0:
  judgement_labels = grid.predict(tfv.transform(judgement_texts))
  sentimentList = [1 if label == "Positive" else 0 for label in judgement_labels]
else:
  #Nothing to predict for an empty data frame.
  sentimentList = []

#Create new column in the contestant judgement data frame and add the values as sentimentList.
#Assigning the list directly matches predictions to rows by position, with no index alignment.
df2['Sentiment_Prediction'] = sentimentList

#Convert the contestant judgement data frame into a new csv file.
df2.to_csv('contestant_judgement.csv')