# Importing Data from a CSV file

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('C:/Users/alami/OneDrive/Desktop/Speech recognition thesis/working with data/IMDB_Dataset.csv')

In [3]:
df.shape

(50000, 2)

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [6]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [7]:
X = df['review']
y = df['sentiment']

In [9]:
X.shape

(50000,)

In [10]:
y.shape

(50000,)

# Splitting Data for Training & Testing

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [13]:
X_train.head()

42016    This movie is powerful. I watched this movie a...
32353    I rented "The China Syndrome" recently mainly ...
32118    Shinjuku Triad Society, albeit from perfect, i...
38310    Sick of the current cinema output, particularl...
7305     Disney's done it again. The company that made ...
Name: review, dtype: object

In [14]:
X_train.shape

(35000,)

In [15]:
X_test.shape

(15000,)

In [16]:
y_train.shape

(35000,)

In [17]:
y_test.shape

(15000,)

# Logistic Regression for Text Classification

In [18]:
from sklearn.linear_model import LogisticRegression

In [19]:
model_LR = LogisticRegression()

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
vectorizer = CountVectorizer()

In [22]:
vectorizer.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [23]:
X_train_vectors = vectorizer.transform(X_train)

In [24]:
X_test_vectors = vectorizer.transform(X_test)

In [25]:
model_LR.fit(X_train_vectors, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [26]:
model_LR.score(X_test_vectors, y_test)

0.8904666666666666

In [27]:
y_predictions = model_LR.predict(X_test_vectors)

In [28]:
from sklearn import metrics

In [29]:
print(metrics.classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

    negative       0.89      0.89      0.89      7507
    positive       0.89      0.89      0.89      7493

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



In [39]:
print(metrics.accuracy_score(y_test, y_predictions))

0.8914666666666666


---
Use of random_state for a consistent split

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [32]:
X_train.head()

38094    As much as I love trains, I couldn't stomach t...
40624    This was a very good PPV, but like Wrestlemani...
49425    Not finding the right words is everybody's pro...
35734    I'm really suprised this movie didn't get a hi...
41708    I'll start by confessing that I tend to really...
Name: review, dtype: object

# Using Machine Learning Pipelines

In [33]:
from sklearn.pipeline import Pipeline

In [35]:
classifier = Pipeline([('cv', CountVectorizer()), ('clf', LogisticRegression())])

In [36]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('cv',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                            

In [37]:
y_predictions = classifier.predict(X_test)

In [38]:
print(metrics.classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

    negative       0.89      0.88      0.89      7411
    positive       0.89      0.90      0.89      7589

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



In [40]:
print(metrics.accuracy_score(y_test, y_predictions))

0.8914666666666666


In [41]:
classifier.predict(["It was good movie, with a nice story"])

array(['positive'], dtype=object)

In [42]:
classifier.predict(["It wasn't a good movie, with really bad editing"])

array(['negative'], dtype=object)

# Using TF-IDF Vectorization

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [48]:
classifier = Pipeline([('tfidf', TfidfVectorizer()), ('clf', LogisticRegression())])

In [49]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [50]:
y_predictions = classifier.predict(X_test)

In [51]:
print(metrics.classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      7411
    positive       0.89      0.91      0.90      7589

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



In [52]:
print(metrics.accuracy_score(y_test, y_predictions))

0.8993333333333333


# Applying N-grams with TF-IDF

In [53]:
classifier = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,2))), ('clf', LogisticRegression())])

In [54]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling

In [55]:
y_predictions = classifier.predict(X_test)

In [56]:
print(metrics.classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      7411
    positive       0.89      0.91      0.90      7589

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000



In [57]:
print(metrics.accuracy_score(y_test, y_predictions))

0.9006


# Naive Bayes for Text Classification

In [59]:
from sklearn.naive_bayes import MultinomialNB

In [60]:
classifier = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,2))), ('clf', MultinomialNB())])

In [61]:
classifier.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [62]:
y_predictions = classifier.predict(X_test)

In [63]:
print(metrics.classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

    negative       0.86      0.93      0.89      7411
    positive       0.92      0.85      0.88      7589

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000



In [64]:
print(metrics.accuracy_score(y_test, y_predictions))

0.8875333333333333


# Linear SVC for Text Classification

In [65]:
from sklearn.svm import LinearSVC

In [66]:
classifier = Pipeline([('tfidf', TfidfVectorizer(ngram_range=(1,2))), ('clf', LinearSVC())])

In [67]:
classifier.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [68]:
y_predictions = classifier.predict(X_test)

In [69]:
print(metrics.classification_report(y_test, y_predictions))

              precision    recall  f1-score   support

    negative       0.93      0.91      0.92      7411
    positive       0.91      0.93      0.92      7589

    accuracy                           0.92     15000
   macro avg       0.92      0.92      0.92     15000
weighted avg       0.92      0.92      0.92     15000



In [71]:
print(metrics.accuracy_score(y_test, y_predictions))

0.9198666666666667
