In [65]:
import os
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score,cross_validate
from sklearn.model_selection import KFold

In [43]:
# Location of the IMDB reviews CSV. Set the IMDB_DATASET_PATH environment
# variable to point at a local copy; the previously hard-coded path is kept
# as the default so existing setups keep working unchanged.
dataset_path = os.environ.get(
    'IMDB_DATASET_PATH',
    '/Users/babyhandzzz/Desktop/ELEPH@NT/Datasets/imdb_master.csv',
)

In [44]:
# Loading data
# Read only the review text and its label from the CSV.
dataframe = pd.read_csv(dataset_path, encoding='latin1', usecols=['review', 'label'])

# Drop the rows whose label is 'unsup'. The explicit copy makes the filtered
# frame independent of the one returned by read_csv, so the column assignment
# below is a plain write rather than a write into a slice.
dataframe = dataframe.loc[dataframe.label != 'unsup'].copy()

# Encode the labels as integers and assign the result back. Calling
# `.replace(..., inplace=True)` on `dataframe.label` operates on a temporary
# Series and is not guaranteed to update the frame (it does not under pandas
# Copy-on-Write).
label_codes = dataframe['label'].map({'neg': 0, 'pos': 1})
if label_codes.isna().any():
    # Fail here with a clear message instead of passing unmapped values on
    # to the model-fitting cells.
    unexpected = sorted(dataframe.loc[label_codes.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected label values: {unexpected}")
dataframe['label'] = label_codes.astype('int64')

# Features and target are kept as single-column DataFrames; X is copied
# because later cells rewrite its `review` column.
X = dataframe[['review']].copy()
y = dataframe[['label']]

In [45]:
# Text_pre_processing

# Translation table that deletes every character in string.punctuation.
# Built once here rather than on every call, since it never changes.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def remove_punct(text):
    """Return `text` with every `string.punctuation` character removed.

    Characters are deleted, not replaced by spaces, so words separated only
    by punctuation are joined (e.g. "good/bad" becomes "goodbad").

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        `text` without punctuation characters.
    """
    return text.translate(_PUNCT_TABLE)
# Strip punctuation from every review. `assign` returns a new frame, which
# avoids setting a column through attribute access on a frame that was
# sliced out of another one; the function is passed directly, no lambda needed.
X = X.assign(review=X['review'].map(remove_punct))

# English stopwords from NLTK, stored as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally - confirm for fresh environments.
stop = set(stopwords.words('english'))

def remove_stopwords(text, stop_words=None):
    """Lower-case `text` and drop every word found in the stopword set.

    Parameters
    ----------
    text : str
        Text to filter; it is split on whitespace.
    stop_words : collection of str, optional
        Lower-case words to remove. When omitted, the notebook-level `stop`
        set is used, which keeps existing one-argument calls working.

    Returns
    -------
    str
        The remaining lower-cased words joined by single spaces.
    """
    if stop_words is None:
        stop_words = stop
    # Lower-case each word once, then filter on the lowered form.
    lowered_words = (word.lower() for word in text.split())
    return " ".join(word for word in lowered_words if word not in stop_words)

# Lower-case every review and drop stopwords. `assign` builds a new frame
# instead of setting the column through attribute access.
X = X.assign(review=X['review'].map(remove_stopwords))

In [5]:
# Train/Test split & reset indices
# Hold out 30% of the rows for testing (fixed random_state), then give each
# of the four resulting pieces a fresh 0..n-1 index.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop=True)
    for part in train_test_split(X, y, test_size=0.3, random_state=16)
)

In [6]:
# tfidf transformation -> sparse matrix -> np.array -> pandas df
tfidf = TfidfVectorizer(use_idf=True, min_df=10, max_df=1.0)

# Fit the vectorizer on the training reviews only, then reuse the fitted
# vectorizer to transform the test reviews.
X_train_tfidf = tfidf.fit_transform(X_train.review)
X_test_tfidf = tfidf.transform(X_test.review)

# `get_feature_names` is no longer available in current scikit-learn
# releases; prefer `get_feature_names_out` and fall back to the old name on
# installations that predate it. `list(...)` keeps `column_names` a list.
if hasattr(tfidf, 'get_feature_names_out'):
    column_names = list(tfidf.get_feature_names_out())
else:
    column_names = tfidf.get_feature_names()

# NOTE(review): `toarray()` materialises the full dense matrix, which can be
# large for a big vocabulary - confirm memory is acceptable.
X_train_tfidf = pd.DataFrame(X_train_tfidf.toarray(), columns=column_names)
X_test_tfidf = pd.DataFrame(X_test_tfidf.toarray(), columns=column_names)

In [None]:
def collect_metrics(model, X_eval=None, y_eval=None):
    """Print a classification report and the ROC AUC for a fitted classifier.

    Assumes a binary problem with labels 0/1, as produced by the label
    encoding earlier in this notebook.

    Parameters
    ----------
    model : fitted classifier
        Must provide `predict`; `predict_proba` or `decision_function` is
        used for the ROC curve when available.
    X_eval : array-like, optional
        Evaluation features. Defaults to the notebook-level `X_test_tfidf`.
    y_eval : array-like, optional
        True labels. Defaults to the notebook-level `y_test`.

    Returns
    -------
    None
        The report and the AUC value are printed.
    """
    features = X_test_tfidf if X_eval is None else X_eval
    # Flatten so a single-column DataFrame and a 1-D array are handled alike.
    y_true = np.asarray(y_test if y_eval is None else y_eval).ravel()

    predictions = model.predict(features)

    # The ROC curve needs continuous scores: hard 0/1 predictions yield only
    # a single operating point, so the resulting "AUC" is not the real one.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(features)[:, 1]  # column of class 1
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(features)
    else:
        scores = predictions  # last resort: same behaviour as before

    class_report = classification_report(y_true, predictions)
    fpr, tpr, _ = roc_curve(y_true, scores)
    auc_ = auc(fpr, tpr)

    print(class_report)
    print(auc_)

# NOTE(review): `model` is created and fitted in the cells further down this
# notebook, so this call only works once those cells have been run.
collect_metrics(model)

In [35]:
# Baseline: mean 3-fold cross-validation score of a default logistic
# regression on the TF-IDF training features.
model = LogisticRegression()
# Pass the target as a 1-D array; np.ravel works whether y_train is a
# single-column DataFrame or already one-dimensional.
cvs = cross_val_score(model, X_train_tfidf, np.ravel(y_train), cv=3)
print(cvs.mean())

In [57]:
# Fit on the full training set. The target is flattened to 1-D so a
# single-column DataFrame is not passed where a 1-D array is expected.
model.fit(X_train_tfidf, np.ravel(y_train))

LogisticRegression()