## **NLP with Machine Learning**

In [1]:
import pandas as pd
pd.set_option('max_colwidth', None)

import openpyxl

In [2]:
data = pd.read_csv('../Data/movie_reviews.csv')

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166 entries, 0 to 165
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   movie_title         166 non-null    object
 1   rating              166 non-null    object
 2   genre               166 non-null    object
 3   in_theaters_date    166 non-null    object
 4   movie_info          166 non-null    object
 5   directors           166 non-null    object
 6   director_gender     166 non-null    object
 7   tomatometer_rating  166 non-null    int64 
 8   audience_rating     166 non-null    int64 
 9   critics_consensus   147 non-null    object
dtypes: int64(2), object(8)
memory usage: 13.1+ KB


### **Sentiment Analysis**

In [4]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(data.loc[0,'movie_info'])

{'neg': 0.051, 'neu': 0.694, 'pos': 0.255, 'compound': 0.9837}

In [5]:
analyzer.polarity_scores(data.loc[1,'movie_info'])

{'neg': 0.0, 'neu': 0.758, 'pos': 0.242, 'compound': 0.9237}

In [6]:
sent_scores = [analyzer.polarity_scores(line)['compound'] for line in data['movie_info']]

In [7]:
len(sent_scores)

166

In [8]:
data['sentiment_score'] = sent_scores

In [9]:
bottom_10 = data.sort_values(by='sentiment_score')[['movie_title','sentiment_score']].head(10)

In [10]:
top_10 = data.sort_values(by='sentiment_score')[['movie_title','sentiment_score']].tail(10)

In [11]:
bottom_10

Unnamed: 0,movie_title,sentiment_score
7,All Is True,-0.9955
148,The Wind,-0.9838
83,Nightmare Cinema,-0.9756
154,Triple Threat,-0.9696
11,Angel of Mine,-0.9687
27,Charlie Says,-0.9643
113,The Curse of La Llorona,-0.9628
87,Pet Sematary,-0.959
142,The Standoff at Sparrow Creek,-0.959
40,El Chicano,-0.9578


In [12]:
top_10

Unnamed: 0,movie_title,sentiment_score
71,Long Shot,0.9778
36,Dumbo,0.9801
0,A Dog's Journey,0.9837
49,Giant Little Ones,0.9839
93,Red Joan,0.9848
156,UglyDolls,0.9862
48,Five Feet Apart,0.9889
130,The Laundromat,0.9908
81,Missing Link,0.9909
23,Breakthrough,0.9915


### **Text Classification**

**Objective: to predict which movies are directed by females vs. males**
- Clean and normalize the data
- Use CountVectorizer to vectorize the dataset
- Use Naive Bayes and Logistic Regression models to fit the data
- Compare the results

In [13]:
## Clean and normalize the movie_info columns
from Text_Preprocessing import clean_and_normalize

data['data_clean'] = clean_and_normalize(data.movie_info)


In [14]:
## Vectorize the column using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Unigrams and bigrams with English stop words removed. min_df=0.1 is a float,
# so it is a document-frequency proportion: a term must occur in >= 10% of rows.
# NOTE(review): the vectorizer is fit on every row before the train/test split
# performed further down, so test documents influence which terms are kept.
cv = CountVectorizer(stop_words='english',min_df=0.1,ngram_range=(1,2))
dtm = cv.fit_transform(data.data_clean)
# Dense document-term matrix with one named column per retained term.
X = pd.DataFrame(dtm.toarray(), columns=cv.get_feature_names_out())

In [15]:
X.head()

Unnamed: 0,begin,discover,family,film,follow,force,friend,home,leave,life,...,man,new,set,star,story,turn,woman,world,year,young
0,1,0,0,0,0,0,1,0,1,3,...,0,2,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,1,0,1,...,0,1,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [16]:
## Split the data into train and test datasets
from sklearn.model_selection import train_test_split

# Target: the director_gender label of each movie.
y = data['director_gender']
# 80/20 split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified; the classification reports below
# show only 5 'female' rows against 29 'male' rows in the test set, so consider
# stratify=y to keep the class ratio equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

#### **Naive Bayes Model**

In [17]:
## Fit the dataset into Naive Bayes model

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train,y_train)

In [18]:
nb.predict(X_test)

array(['male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'female', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'male', 'male', 'male', 'male',
       'male', 'male', 'male', 'male', 'female', 'male', 'male', 'female',
       'male', 'female'], dtype='<U6')

In [19]:
## Evaluate the model

from sklearn.metrics import accuracy_score, classification_report

# Predict once and reuse the result for both metrics instead of calling
# nb.predict(X_test) twice.
nb_test_pred = nb.predict(X_test)
print(f'Accuracy Score: {accuracy_score(y_test,nb_test_pred)}')
print(classification_report(y_test,nb_test_pred))

Accuracy Score: 0.7941176470588235
              precision    recall  f1-score   support

      female       0.25      0.20      0.22         5
        male       0.87      0.90      0.88        29

    accuracy                           0.79        34
   macro avg       0.56      0.55      0.55        34
weighted avg       0.78      0.79      0.78        34



#### **Logistic Regression Model**

In [20]:
## Fit the dataset into a Logistic Regression
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)


In [21]:
## Evaluate the Logistic Regression model on the held-out test set
# Predict once and reuse the result for both metrics instead of calling
# lr.predict(X_test) twice.
lr_test_pred = lr.predict(X_test)
print(f'Accuracy Score: {accuracy_score(y_test,lr_test_pred)}')
print(classification_report(y_test,lr_test_pred))

Accuracy Score: 0.7647058823529411
              precision    recall  f1-score   support

      female       0.29      0.40      0.33         5
        male       0.89      0.83      0.86        29

    accuracy                           0.76        34
   macro avg       0.59      0.61      0.60        34
weighted avg       0.80      0.76      0.78        34



In [22]:
## Predicted probability that each movie has a female director
# predict_proba columns follow the order of lr.classes_, so look the 'female'
# column up by label instead of hard-coding index 0.
# Note: scores are computed for every row of X, including the rows the model
# was trained on.
female_col = list(lr.classes_).index('female')
data['female_director_prediction'] = lr.predict_proba(X)[:, female_col]

In [23]:
(data.loc[:,['movie_title','director_gender','female_director_prediction']]
    .sort_values(by='female_director_prediction',ascending=False)
)

Unnamed: 0,movie_title,director_gender,female_director_prediction
55,Greta,male,0.840252
27,Charlie Says,female,0.721752
140,The Secret Life of Pets 2,male,0.692048
76,Mary Magdalene,male,0.671338
69,Little,female,0.620955
...,...,...,...
56,Gwen,male,0.014585
57,Hampstead,male,0.010669
47,Fighting with My Family,male,0.008816
155,Tyler Perry's A Madea Family Funeral,male,0.007876


### **Topic Modelling**

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
## Vectorize the clean data with the TF-IDF Vectorizer
# min_df/max_df are floats, i.e. document-frequency proportions: keep terms that
# appear in at least 5% and at most 20% of the documents.
tv = TfidfVectorizer(stop_words='english',min_df=0.05, max_df=0.2)
tfidf = tv.fit_transform(data['data_clean'])

In [82]:
feature_names = tv.get_feature_names_out()

In [83]:
tfidf_df = pd.DataFrame(tfidf.toarray(),columns=feature_names)

In [84]:
tfidf_df

Unnamed: 0,academy,action,adventure,american,base,battle,begin,black,bring,change,...,true,try,turn,unexpected,war,way,woman,work,year,young
0,0.0,0.0,0.264584,0.0,0.000000,0.0,0.234867,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.00000
1,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.37314,0.0,...,0.0,0.0,0.0,0.0,0.0,0.37314,0.000000,0.0,0.0,0.00000
2,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.00000
3,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.360338,0.0,0.0,0.00000
4,0.0,0.0,0.000000,0.0,0.309702,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
161,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.00000
162,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.316159,0.0,0.0,0.00000
163,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.397599,0.0,0.0,0.40279
164,0.0,0.0,0.000000,0.0,0.000000,0.0,0.431795,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00000,0.397393,0.0,0.0,0.00000


In [85]:
## Create NMF (Non-negative Matrix Factorization) model to find the main topics
from sklearn.decomposition import NMF

In [99]:
nmf = NMF(n_components=6, random_state=42)
W = nmf.fit_transform(tfidf_df)
H = nmf.components_

In [100]:
## Generator function to iterate over each topic (row) of H
def get_topics(H, num_words=10, vocabulary=None):
    """Yield ``(topic_index, top_words)`` for every row of the topic matrix ``H``.

    Parameters
    ----------
    H : 2-D array-like of numpy arrays
        Topic-term weights, one row per topic (e.g. ``nmf.components_``).
    num_words : int, default 10
        Number of highest-weighted terms to return per topic.
    vocabulary : sequence of str, optional
        Term for each column of ``H``. When omitted, the notebook-level
        ``feature_names`` variable is used, which keeps existing calls working.

    Yields
    ------
    tuple[int, list[str]]
        Zero-based topic index and its top terms, highest weight first.
    """
    # Fall back to the notebook global only when no vocabulary is supplied.
    words = feature_names if vocabulary is None else vocabulary
    for topic_num, topic_weights in enumerate(H):
        # argsort is ascending: reverse it, then keep the first ``num_words`` indices.
        top_idx = topic_weights.argsort()[::-1][:num_words]
        yield topic_num, [words[i] for i in top_idx]

In [101]:
## Human-readable summary: (1-based topic number, comma-separated top words)
# Join with ', ' (comma then space); the previous ' ,' separator put the space
# before the comma.
topic_list = [
    (topic_num + 1, ', '.join(top_words))
    for topic_num, top_words in get_topics(H)
]

In [102]:
topic_list

[(1, 'family ,father ,home ,search ,day ,past ,try ,daughter ,journey ,fight'),
 (2, 'set ,force ,turn ,evil ,follow ,night ,begin ,bring ,way ,play'),
 (3, 'film ,true ,event ,inspire ,base ,year ,comedy ,come ,follow ,academy'),
 (4, 'friend ,good ,dream ,help ,live ,town ,love ,begin ,adventure ,leave'),
 (5, 'woman ,young ,mother ,face ,child ,thriller ,star ,hope ,think ,grow'),
 (6, 'man ,lead ,war ,black ,discover ,secret ,return ,action ,series ,city')]

In [115]:
topics = ['family','horror','true events','romantic','thriller','war']

In [125]:
pd.DataFrame(W, columns=topics)

Unnamed: 0,family,horror,true events,romantic,thriller,war
0,0.000000,0.000000,0.000000,0.395842,0.000000,0.000000
1,0.046118,0.197497,0.000000,0.114081,0.000000,0.012323
2,0.073950,0.000000,0.000000,0.109129,0.000000,0.000000
3,0.000000,0.000000,0.075745,0.000000,0.321728,0.000000
4,0.026065,0.018300,0.080497,0.105446,0.045193,0.037647
...,...,...,...,...,...,...
161,0.000000,0.266801,0.000000,0.000000,0.081399,0.000000
162,0.000000,0.000000,0.312062,0.000000,0.196328,0.000000
163,0.000000,0.000000,0.000000,0.000000,0.510853,0.000000
164,0.000000,0.000000,0.000000,0.282126,0.148251,0.000000


In [126]:
## Combine with original dataframe
movie_cluster = pd.concat([data[['movie_title','movie_info']],pd.DataFrame(W, columns=topics)],axis=1)

In [127]:
## Label each movie with its highest-weighted topic
# Select the topic columns by name rather than by position (iloc[:, 2:]) so the
# cell stays correct when re-run after 'movie_type' / 'sentiment_score' have
# been appended to movie_cluster.
movie_cluster['movie_type'] = movie_cluster[topics].idxmax(axis=1)

In [128]:
## Add sentiment scores for each movie
movie_cluster['sentiment_score'] = data['sentiment_score']

In [129]:
movie_cluster[['movie_title','movie_info','movie_type','sentiment_score']]

Unnamed: 0,movie_title,movie_info,movie_type,sentiment_score
0,A Dog's Journey,"Bailey (voiced again by Josh Gad) is living the good life on the Michigan farm of his ""boy,"" Ethan (Dennis Quaid) and Ethan's wife Hannah (Marg Helgenberger). He even has a new playmate: Ethan and Hannah's baby granddaughter, CJ. The problem is that CJ's mom, Gloria (Betty Gilpin), decides to take CJ away. As Bailey's soul prepares to leave this life for a new one, he makes a promise to Ethan to find CJ and protect her at any cost. Thus begins Bailey's adventure through multiple lives filled with love, friendship and devotion as he, CJ (Kathryn Prescott), and CJ's best friend Trent (Henry Lau) experience joy and heartbreak, music and laughter, and few really good belly rubs.",romantic,0.9837
1,A Dog's Way Home,"Separated from her owner, a dog sets off on an 400-mile journey to get back to the safety and security of the place she calls home. Along the way, she meets a series of new friends and manages to bring a little bit of comfort and joy to their lives.",horror,0.9237
2,A Tuba to Cuba,"The leader of New Orleans' famed Preservation Hall Jazz Band seeks to fulfill his late father's dream of retracing their musical roots to the shores of Cuba in search of the indigenous music that gave birth to New Orleans jazz. A TUBA TO CUBA celebrates the triumph of the human spirit expressed through the universal language of music and challenges us to resolve to build bridges, not walls.",romantic,0.9360
3,A Vigilante,"A once abused woman, Sadie (Olivia Wilde), devotes herself to ridding victims of their domestic abusers while hunting down the husband she must kill to truly be free. A Vigilante is a thriller inspired by the strength and bravery of real domestic abuse survivors and the incredible obstacles to safety they face.",thriller,-0.0334
4,After,"Based on Anna Todd's best-selling novel which became a publishing sensation on social storytelling platform Wattpad, AFTER follows Tessa (Langford), a dedicated student, dutiful daughter and loyal girlfriend to her high school sweetheart, as she enters her first semester in college. Armed with grand ambitions for her future, her guarded world opens up when she meets the dark and mysterious Hardin Scott (Tiffin), a magnetic, brooding rebel who makes her question all she thought she knew about herself and what she wants out of life.",romantic,0.9349
...,...,...,...,...
161,Velvet Buzzsaw,"Velvet Buzzsaw is a satirical thriller set in the contemporary art world scene of Los Angeles, where big money artists and mega-collectors pay a high price when art collides with commerce.",horror,-0.2732
162,What Men Want,"Inspired by the Nancy Meyers hit romantic comedy WHAT WOMEN WANT, this film follows the story of a female sports agent (Henson) who has been constantly boxed out by her male colleagues. When she gains the power to hear mens' thought, she is able to shift the paradigm to her advantage as she races to sign the NBA's next superstar",true events,0.9158
163,Wild Rose,"WILD ROSE tells the complicated story of Rose-Lynn, a woman on a quest to become a country music star, while also grappling with the responsibilities of being recently released from prison and a young mother of two children.",thriller,-0.5106
164,Wine Country,"In honor of Rebecca (Rachel Dratch)'s 50th birthday, Abby (Amy Poehler) plans a scenic Napa getaway with their best, longtime friends. Workaholic Catherine (Ana Gasteyer), post-op Val (Paula Pell), homebody Jenny (Emily Spivey), and weary mom Naomi (Maya Rudolph) are equally sold on the chance to relax and reconnect. Yet as the alcohol flows, real world uncertainties intrude on the punchlines and gossip, and the women begin questioning their friendships and futures.",romantic,0.9081
