# **Kaggle API linking**

Fetch the dataset from Kaggle using a Kaggle API token (`kaggle.json`) and the dataset's API download command.

In [None]:
# Upload the Kaggle API token file (kaggle.json) through Colab's upload dialog.
from google.colab import files
files.upload()

# Copy the uploaded token into ~/.kaggle/ and make it readable/writable by the
# owner only (mode 600). Assumes the uploaded file is named exactly kaggle.json.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
!unzip /content/imdb-dataset-of-50k-movie-reviews.zip

# **Import necessary packages**

In [25]:
import numpy as np
import time
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# **Load dataset**

Load the `.csv` dataset as a pandas DataFrame, then remove HTML tags, punctuation, and other unnecessary elements from the reviews using regular expressions.

In [26]:
import pandas as pd

# Read the extracted Kaggle CSV into a DataFrame and preview the first rows.
df = pd.read_csv(filepath_or_buffer='/content/IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [27]:
import re
def preprocessor(text):
    """Normalize a raw review into lowercase, space-separated word tokens.

    Steps:
      1. Strip HTML-like tags (anything between ``<`` and ``>``).
      2. Collect emoticons such as ``:)``, ``;-(``, ``=D`` or ``:P`` before
         the punctuation removal destroys them.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` "nose" removed),
         separated by single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text followed by any emoticons found in the input.
    """
    # Raw strings keep the regex escapes (\), \(, \W) from being treated as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'\W+', ' ', text.lower())
    emoticon_suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued onto the last
    # word whenever the cleaned text ends in a word character
    # (e.g. ":) hello" would become " hello:)").
    if emoticon_suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + emoticon_suffix

# Clean every review with `preprocessor`, replacing the raw text column,
# then preview the result.
df['review'] = df['review'].map(preprocessor)
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there s a family where a little boy ...,negative
4,petter mattei s love in the time of money is a...,positive


# **Data Preparation**

Split the dataset into training and test sets: 70% of the reviews are used for training and the remaining 30% for testing. The row indices are shuffled before splitting.

In [28]:
# Fixed seed so the shuffle -- and therefore the train/test split and the
# metrics computed from it -- is reproducible across kernel restarts.
RANDOM_SEED = 42

num_samples = len(df['review'])
num_train = int(num_samples * 0.7)
# Use the complement rather than int(num_samples * 0.3) so that rounding can
# never leave a sample out of both sets.
num_test = num_samples - num_train
# A shuffled arrangement of the row positions 0 .. num_samples - 1.
random_indices = np.random.default_rng(RANDOM_SEED).permutation(num_samples)

In [29]:
# Slice the shuffled index once per split: the first `num_train` entries form
# the training set, the last `num_test` entries form the test set.
train_idx = random_indices[:num_train]
test_idx = random_indices[-num_test:]

# Pull features and labels for each split from the same rows.
reviews, sentiments = df['review'], df['sentiment']
X_train, y_train = reviews.loc[train_idx].values, sentiments.loc[train_idx].values
X_test, y_test = reviews.loc[test_idx].values, sentiments.loc[test_idx].values

In [30]:
# Sanity check: report how many samples landed in each split.
print(
    "X_train : ", X_train.shape,
    " X_test : ", X_test.shape,
    "\ny_train : ", y_train.shape,
    " y_test : ", y_test.shape,
)

X_train :  (35000,)  X_test :  (15000,) 
y_train :  (35000,)  y_test :  (15000,)


# **Model**

The model is a scikit-learn `Pipeline` consisting of a TF-IDF vectorizer followed by a logistic regression classifier (`LogisticRegressionCV`). The TF-IDF features are computed first and then passed to the classifier.

In [31]:
# Stage 1 converts each review string into a TF-IDF feature vector;
# stage 2 is a cross-validated logistic regression trained on those vectors.
tfidf = TfidfVectorizer()
classifier = LogisticRegressionCV(max_iter=4000)
clf = Pipeline(steps=[('tfidf', tfidf), ('clf', classifier)])

# Fit both stages on the training split; the fitted pipeline is the cell output.
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=None,
                                      dual=False, fit_intercept=True,
    

In [32]:
# Predict a sentiment label for every review in the held-out test split.
y_pred = clf.predict(X_test)

In [33]:
# Rows are the true labels and columns the predicted labels, both in sorted
# label order (negative, positive).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[6718,  816],
       [ 702, 6764]])

In [34]:
# Per-class precision, recall and F1 score on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      7534
    positive       0.89      0.91      0.90      7466

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



In [35]:
# Fraction of test reviews whose predicted sentiment matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8988