In [2]:
import os
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score,cross_validate
from sklearn.model_selection import KFold

In [14]:
# Location of the IMDB master CSV. Set the IMDB_DATASET_PATH environment variable to
# point at your own copy; the original author's local path is kept as the fallback.
dataset_path = os.environ.get(
    'IMDB_DATASET_PATH',
    '/Users/babyhandzzz/Desktop/ELEPH@NT/Datasets/imdb_master.csv',
)

In [15]:
# Loading data
dataframe = pd.read_csv(dataset_path, encoding='latin1', usecols=['review', 'label'])
# Drop the rows labelled 'unsup'. .copy() yields an independent frame so the column
# assignment below does not write into a slice of the frame that was read from disk.
dataframe = dataframe.loc[dataframe.label != 'unsup'].copy()
# Encode sentiment as integers (neg -> 0, pos -> 1). Assigning the mapped column back
# replaces the chained `dataframe.label.replace(..., inplace=True)`, which mutates an
# intermediate Series and is not guaranteed to update the frame itself.
# astype('int64') fails loudly if any label other than 'neg'/'pos' is present.
dataframe['label'] = dataframe['label'].map({'neg': 0, 'pos': 1}).astype('int64')
# Single-column frames (2-D); copied so later preprocessing can assign to their
# columns without touching `dataframe` or triggering SettingWithCopyWarning.
X = dataframe[['review']].copy()
y = dataframe[['label']].copy()

In [26]:
# Peek at the raw text of the review stored under index label 0
dataframe.loc[0, 'review']

"Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in."

# Pre-processing

In [22]:
# Text_pre_processing

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call (the function runs once per review).
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text):
    """Return ``text`` with all ASCII punctuation characters removed.

    Characters listed in ``string.punctuation`` are deleted, not replaced by a
    space, so "he's" becomes "hes" and "well-known" becomes "wellknown".

    Parameters
    ----------
    text : str
        Input string.

    Returns
    -------
    str
        Copy of ``text`` without punctuation characters.
    """
    return text.translate(_PUNCT_TABLE)
# Strip punctuation from every review. assign() returns a new frame, so nothing is
# written into a slice of `dataframe` (which is what raises SettingWithCopyWarning).
X = X.assign(review=X.review.map(remove_punct))

# English stop-word list from NLTK, held as a set for fast membership checks
stop = set(stopwords.words('english'))

def remove_stopwords(text, stopword_set=None):
    """Lower-case ``text`` and drop every whitespace-separated token that is a stop word.

    Parameters
    ----------
    text : str
        Input string; it is split on whitespace.
    stopword_set : collection of str, optional
        Lower-case words to remove. When omitted, the notebook-level ``stop``
        set is used, so existing ``Series.map(remove_stopwords)`` calls behave
        exactly as before.

    Returns
    -------
    str
        The surviving tokens, lower-cased and joined by single spaces.
    """
    if stopword_set is None:
        # Resolved at call time so the global can be (re)defined in another cell.
        stopword_set = stop

    kept = []
    for token in text.split():
        lowered = token.lower()  # lower-case once, reuse for both test and output
        if lowered not in stopword_set:
            kept.append(lowered)
    return " ".join(kept)

# Lower-case the reviews and drop stop words. assign() builds a new frame rather than
# setting a column on a slice, which avoids SettingWithCopyWarning.
X = X.assign(review=X.review.map(remove_stopwords))

In [24]:
# Show the cleaned text of the review stored under index label 0
X.loc[0, 'review']

'mr costner dragged movie far longer necessary aside terrific sea rescue sequences care characters us ghosts closet costners character realized early forgotten much later time care character really care cocky overconfident ashton kutcher problem comes kid thinks hes better anyone else around shows signs cluttered closet obstacle appears winning costner finally well past half way point stinker costner tells us kutchers ghosts told kutcher driven best prior inkling foreshadowing magic could keep turning hour'

# Performance metrics

In [None]:
def collect_metrics(model, X_eval=None, y_eval=None):
    """Print a classification report and the ROC AUC of a fitted classifier.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``. ``predict_proba`` or ``decision_function`` is
        used for the ROC curve when available.
    X_eval : array-like, optional
        Features to evaluate on. Defaults to the notebook-level ``X_test_tfidf``.
    y_eval : array-like, optional
        True labels for ``X_eval``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        The report and the AUC value are printed.
    """
    # Defaults are resolved at call time because the test split is created in a later cell.
    if X_eval is None:
        X_eval = X_test_tfidf
    if y_eval is None:
        y_eval = y_test

    predictions = model.predict(X_eval)
    class_report = classification_report(y_eval, predictions)

    # A ROC curve needs continuous scores. Feeding it hard 0/1 predictions collapses
    # the curve to a single threshold and misreports the AUC.
    if hasattr(model, "predict_proba"):
        # Assumes a binary problem where column 1 is the positive class
        # (model.classes_[1]) -- TODO confirm if labels other than 0/1 are used.
        scores = model.predict_proba(X_eval)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(X_eval)
    else:
        # No score available: fall back to the previous behaviour.
        scores = predictions

    fpr, tpr, _ = roc_curve(y_eval, scores)
    auc_ = auc(fpr, tpr)

    print(class_report)
    print(auc_)

# Train/Test split

In [5]:
# Hold out 30% of the rows for testing (fixed seed), then renumber each of the
# four resulting pieces from 0 so they no longer carry the original row labels.
X_train, X_test, y_train, y_test = (
    piece.reset_index(drop=True)
    for piece in train_test_split(X, y, test_size=0.3, random_state=16)
)

In [6]:
# tfidf transformation -> sparse matrix -> np.array -> pandas df
tfidf = TfidfVectorizer(use_idf=True, min_df=10, max_df=1.0)

# Vocabulary and IDF weights are learned from the training reviews only and then
# reused, unchanged, to transform the test reviews.
X_train_tfidf = tfidf.fit_transform(X_train.review)
X_test_tfidf = tfidf.transform(X_test.review)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version provides it and fall back otherwise.
if hasattr(tfidf, "get_feature_names_out"):
    column_names = list(tfidf.get_feature_names_out())
else:
    column_names = tfidf.get_feature_names()

# NOTE: toarray() materialises a dense (n_rows x n_terms) array.
X_train_tfidf = X_train_tfidf.toarray()
X_test_tfidf = X_test_tfidf.toarray()

X_train_tfidf = pd.DataFrame(X_train_tfidf, columns=column_names)
X_test_tfidf = pd.DataFrame(X_test_tfidf, columns=column_names)

In [35]:
# 3-fold cross-validation of a default logistic regression on the training split,
# scored with the estimator's default scorer.
model = LogisticRegression()
# y_train is a single-column frame; np.ravel flattens it to the 1-D shape scikit-learn
# expects, which avoids a DataConversionWarning on every fold.
cvs = cross_val_score(model, X_train_tfidf, np.ravel(y_train), cv=3)
print(cvs.mean())

In [57]:
# Fit on the full training split, then report metrics on the held-out test split.
# np.ravel turns the single-column label frame into the 1-D vector scikit-learn expects.
model.fit(X_train_tfidf, np.ravel(y_train))
collect_metrics(model)

LogisticRegression()

In [9]:
### cross_validation ###
# TF-IDF features for the whole labelled dataset (no train/test split).
# NOTE: the vocabulary and IDF weights are fitted on all rows before cross-validation,
# so every validation fold has already influenced the features it is scored on.
# NOTE: X is rebound from the text frame to the feature frame here, so this cell can
# only be re-run after the pre-processing cells have rebuilt the text version of X.
tfidf = TfidfVectorizer(use_idf=True, min_df=10, max_df=1.0)
X = tfidf.fit_transform(X.review)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when the
# installed version provides it and fall back otherwise.
if hasattr(tfidf, "get_feature_names_out"):
    column_names = list(tfidf.get_feature_names_out())
else:
    column_names = tfidf.get_feature_names()
X = X.toarray()
X = pd.DataFrame(X, columns=column_names)

In [12]:
# 3-fold cross-validation of a default logistic regression on the full feature frame.
model = LogisticRegression()
# y is a single-column frame; np.ravel flattens it to the 1-D shape scikit-learn
# expects, which avoids a DataConversionWarning on every fold.
cvs = cross_val_score(model, X, np.ravel(y), cv=3)
print(cvs.mean())

0.880379991104126
