In [24]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import pandas as pd
import numpy as np
import ast
import re
import time
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score




def pre_processing(df):
    t0 = time.time()
    df['plot'] = df['plot'].str.lower() #Converting to lowercase    
    plot_list = []
    for i in range(df.shape[0]):
        text = df['plot'][i] #Pre-processing only done on 'plot' column
        text = re.sub("[^a-zA-Z]"," ",text) #Removing all non-alphabet characters
        text_tokens = [token for token in text.split() if token.lower() not in ENGLISH_STOP_WORDS] #Removing stopwords
        text_tokens = " ".join(text_tokens)
        plot_list.append(text_tokens)
        
    
    ps = PorterStemmer()
    plot_stem = []
    for j in range(len(plot_list)):
        stem_list = []
        for w in plot_list[j]:
            stem_list.append(ps.stem(w)) #Removing stems from each row of pre-processed data
        str1 = ""
        stem_list = str1.join(stem_list)
        plot_stem.append(stem_list)
    
        
    for i in range(df['genre'].shape[0]):
        df['genre'][i] = ast.literal_eval(df['genre'][i]) #String to list ('[]' to []). NOTE: This returns an error message, but does not affect the functioning of the code        
    
    multilabel_binarizer = MultiLabelBinarizer()
    multilabel_binarizer.fit(df['genre'])
    genre_encoded = multilabel_binarizer.transform(df['genre']) #Creates 1x20 list for each row, where each element represents one genre (binary representation)

    t1 = time.time()
    total = t1-t0
    print('Time taken to pre-process data = ',total,'\n')

    df['plot_processed'] = plot_stem #Final pre-processed data added as new column to df
    
    return df,genre_encoded

df = pd.read_csv('train.csv')
df_train,y_train = pre_processing(df) #Running pre-processing on training data


# Creating Machine Learning Model
vectorizer = TfidfVectorizer(max_df = 0.5,max_features=10000)
X_train = vectorizer.fit_transform(df_train['plot_processed']) #Fitting tfidf vectorizer to training data
    
tfidf_output = pd.DataFrame(data = X_train.toarray(),columns = vectorizer.get_feature_names())
print('Feature Vectors Generated using TFidf: ','\n\n',tfidf_output)
    
lr = LogisticRegression()
clf = OneVsRestClassifier(lr) #Allowing logistic regression to work on multilabel tasks
clf.fit(X_train, y_train) #Training model using training data

df = pd.read_csv('test.csv')
df_test,y_test = pre_processing(df) #Running pre-processing on test data

#Evaluation
X_test = vectorizer.transform(df_test['plot_processed']) #Applying model to test data
#y_pred = clf.predict(X_test)
y_pred = clf.predict_proba(X_test) #Getting probability estimate for predicted values
y_pred_thres = (y_pred_prob >= 0.2) #Setting threshold for logistic regression predicted value (default = 0.5)
print('F1 Score is: ',f1_score(y_test, y_pred_new, average="micro"))




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['genre'][i] = ast.literal_eval(df['genre'][i]) #String to list ('[]' to []). NOTE: This returns an error message, but does not affect the functioning of the code


Time taken to pre-process data =  19.76025390625 

Feature Vectors Generated using TFidf:  

        aakash  aaron  aarti  abandon  abandoned  abandoning  abandons  abbey  \
0         0.0    0.0    0.0      0.0   0.000000         0.0       0.0    0.0   
1         0.0    0.0    0.0      0.0   0.000000         0.0       0.0    0.0   
2         0.0    0.0    0.0      0.0   0.000000         0.0       0.0    0.0   
3         0.0    0.0    0.0      0.0   0.017504         0.0       0.0    0.0   
4         0.0    0.0    0.0      0.0   0.000000         0.0       0.0    0.0   
...       ...    ...    ...      ...        ...         ...       ...    ...   
24995     0.0    0.0    0.0      0.0   0.000000         0.0       0.0    0.0   
24996     0.0    0.0    0.0      0.0   0.000000         0.0       0.0    0.0   
24997     0.0    0.0    0.0      0.0   0.000000         0.0       0.0    0.0   
24998     0.0    0.0    0.0      0.0   0.000000         0.0       0.0    0.0   
24999     0.0    0.0    0.

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['genre'][i] = ast.literal_eval(df['genre'][i]) #String to list ('[]' to []). NOTE: This returns an error message, but does not affect the functioning of the code


Time taken to pre-process data =  3.87155818939209 

F1 Score is:  0.549186676994578


In [None]:
#Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB

y_new = np.array(y_train,dtype=str)
y_train_new = []
for row in range(y_new.shape[0]):
    y_train_new.append("".join(y_new[row]))
#print(y_train_new)


y_new2 = np.array(y_train,dtype=str)
y_test_new = []
for row in range(y_new2.shape[0]):
    y_test_new.append("".join(y_new2[row]))
#print(y_test_new)

clf = GaussianNB()
clf.fit(X_train.toarray(), y_train_new)
print ('Accuracy:{0: .1f}%'.format(clf.score(X_test.toarray(), y_test_new) * 100))


