# Movie Recommendation System Project - EDA 

### Importing packages

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import preprocessing

from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.tag import pos_tag, pos_tag_sents
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 

import re
import string

### Reading the CSV

In [2]:
#loading the raw Wikipedia movie plots dataset
plots_csv = 'wiki_movie_plots_deduped.csv'
movie_df = pd.read_csv(plots_csv)

## 1. Preliminary EDA

### 1.1 General Inspection

### Looking at the top 5 rows

In [3]:
#previewing the first 5 rows
movie_df.head(n=5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
0,1901,Kansas Saloon Smashers,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Kansas_Saloon_Sm...,"A bartender is working at a saloon, serving dr..."
1,1901,Love by the Light of the Moon,American,Unknown,,unknown,https://en.wikipedia.org/wiki/Love_by_the_Ligh...,"The moon, painted with a smiling face hangs ov..."
2,1901,The Martyred Presidents,American,Unknown,,unknown,https://en.wikipedia.org/wiki/The_Martyred_Pre...,"The film, just over a minute long, is composed..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,,unknown,"https://en.wikipedia.org/wiki/Terrible_Teddy,_...",Lasting just 61 seconds and consisting of two ...
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",,unknown,https://en.wikipedia.org/wiki/Jack_and_the_Bea...,The earliest known adaptation of the classic f...


### There are 34,886 rows (each represents one movie)

In [4]:
#(number of rows, number of columns)
movie_df.shape

(34886, 8)

### Looking at dataframe datatypes 

In [5]:
#datatype of each column
movie_df.dtypes

Release Year         int64
Title               object
Origin/Ethnicity    object
Director            object
Cast                object
Genre               object
Wiki Page           object
Plot                object
dtype: object

### Looking at the first few plots to determine how the text data should be handled

In [6]:
#full plot text of the row labelled 0
movie_df.loc[0, 'Plot']

"A bartender is working at a saloon, serving drinks to customers. After he fills a stereotypically Irish man's bucket with beer, Carrie Nation and her followers burst inside. They assault the Irish man, pulling his hat over his eyes and then dumping the beer over his head. The group then begin wrecking the bar, smashing the fixtures, mirrors, and breaking the cash register. The bartender then sprays seltzer water in Nation's face before a group of policemen appear and order everybody to leave.[1]"

In [7]:
#full plot text of the row labelled 1
movie_df.loc[1, 'Plot']

"The moon, painted with a smiling face hangs over a park at night. A young couple walking past a fence learn on a railing and look up. The moon smiles. They embrace, and the moon's smile gets bigger. They then sit down on a bench by a tree. The moon's view is blocked, causing him to frown. In the last scene, the man fans the woman with his hat because the moon has left the sky and is perched over her shoulder to see everything better."

In [8]:
#full plot text of the row labelled 2
movie_df.loc[2, 'Plot']

'The film, just over a minute long, is composed of two shots. In the first, a girl sits at the base of an altar or tomb, her face hidden from the camera. At the center of the altar, a viewing portal displays the portraits of three U.S. Presidents—Abraham Lincoln, James A. Garfield, and William McKinley—each victims of assassination.\r\nIn the second shot, which runs just over eight seconds long, an assassin kneels feet of Lady Justice.'

### 1.2 Inspecting plot lengths more closely

### Finding longest, shortest, and mean plot lengths

In [9]:
#computing the per-plot character and word counts once and reusing them,
#rather than re-measuring and re-splitting every plot for each statistic
plot_chars = movie_df['Plot'].str.len()
plot_words = movie_df['Plot'].str.split().str.len()

#longest plot length - characters
max_plot_c = plot_chars.max()
#longest plot length - words 
max_plot_w = plot_words.max()

#shortest plot length - characters
min_plot_c = plot_chars.min()
#shortest plot length - words
min_plot_w = plot_words.min()

#mean plot length - characters
mean_plot_c = plot_chars.mean()
#mean plot length - words
mean_plot_w = plot_words.mean()

print("Max plot length - characters: ", max_plot_c)
print("Max plot length - words: ", max_plot_w)
print("Min plot length - characters: ", min_plot_c)
print("Min plot length - words: ", min_plot_w)
print("Mean plot length - characters: ", mean_plot_c)
print("Mean plot length - words: ", mean_plot_w)

Max plot length - characters:  36773
Max plot length - words:  6752
Min plot length - characters:  15
Min plot length - words:  2
Mean plot length - characters:  2165.0345410766495
Mean plot length - words:  372.4932064438457


### Looking at plots with 10 words or fewer

In [10]:
#selecting the rows whose plot is at most 10 whitespace-separated words long
words_per_plot = movie_df['Plot'].str.split().str.len()
mask = words_per_plot.le(10)
movie_df[mask]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
25,1909,The Lure of the Gown,American,D.W. Griffith,Marion Leonard,unknown,https://en.wikipedia.org/wiki/The_Lure_of_the_...,The story as told by Moving Picture World read...
155,1916,Youth's Endearing Charm,American,William C. Dowlan,"Mary Miles Minter, Wallace MacDonald",drama,https://en.wikipedia.org/wiki/Youth%27s_Endear...,The film is about a court case and embezzlement.
881,1930,Africa Speaks!,American,Walter Futter,"Paul Hoefler, Lowell Thomas",documentary,https://en.wikipedia.org/wiki/Africa_Speaks!,Paul L. Hoefler heads a 1928 expedition to Afr...
1466,1933,Broadway Bad,American,Sidney Lanfield,"Joan Blondell, Ricardo Cortez, Ginger Rogers",drama,https://en.wikipedia.org/wiki/Broadway_Bad,Married chorus girl rides scandal to stardom.
1631,1934,As the Earth Turns,American,Alfred E. Green,"Donald Woods, Jean Muir, Dorothy Peterson",drama,https://en.wikipedia.org/wiki/As_the_Earth_Turns,A young couple farm in Maine.
...,...,...,...,...,...,...,...,...
33727,2013,Tokyo Family,Japanese,Yoji Yamada,"Bunta Sugawara, Tomiko Ishii, Masahiko Nishimura",drama,https://en.wikipedia.org/wiki/Tokyo_Family,"The film is set in Tokyo and Ōsakikamijima, Hi..."
33767,2014,Flower and Snake: Zero,Japanese,Hajime Hashimoto,"Maiko Amano, Noriko Hamada, Rina Sakuragi",erotic drama,https://en.wikipedia.org/wiki/Flower_and_Snake...,The film is set in Tokyo.[1]
33790,2014,Clover,Japanese,Takeshi Furusawa,"Emi Takei, Tadayoshi Okura",unknown,https://en.wikipedia.org/wiki/Clover_(2014_film),An office worker struggles to overcome her fir...
33832,2015,Galaxy Turnpike,Japanese,Kōki Mitani,"Shingo Katori, Haruka Ayase, Shun Oguri","science fiction, comedy",https://en.wikipedia.org/wiki/Galaxy_Turnpike,The film is set in 2265.[3]


### Looking at plots with 20 words or fewer, as it seems that those with 10 words or fewer offer very little information for topic modelling...

In [11]:
#selecting the rows whose plot is at most 20 whitespace-separated words long
words_per_plot = movie_df['Plot'].str.split().str.len()
mask = words_per_plot.le(20)
movie_df[mask]

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
25,1909,The Lure of the Gown,American,D.W. Griffith,Marion Leonard,unknown,https://en.wikipedia.org/wiki/The_Lure_of_the_...,The story as told by Moving Picture World read...
28,1910,Frankenstein,American,J. Searle Dawley,"Augustus Phillips, Charles Stanton Ogle, Mary ...",unknown,https://en.wikipedia.org/wiki/Frankenstein_(19...,"Described as ""a liberal adaptation of Mrs. She..."
83,1914,In the Land of the Head Hunters,American,Edward S. Curtis,,documentary drama,https://en.wikipedia.org/wiki/In_the_Land_of_t...,The following plot synopsis was published in c...
117,1915,Four Feathers,American,J. Searle Dawley,"Edgar L. Davenport, Fuller Mellish",drama,https://en.wikipedia.org/wiki/Four_Feathers,Considered a coward by his fiancée and comrade...
118,1915,Fresh from the Farm,American,Hal Roach,Harold Lloyd,comedy,https://en.wikipedia.org/wiki/Fresh_from_the_Farm,"A farm youth goes to college, pursues the pret..."
...,...,...,...,...,...,...,...,...
34787,2017,Just for You,South_Korean,Park Byoung-hwan,"Tim, Son Ha-jung, Choi Jong-nam, Cha Soo-bin",unknown,https://en.wikipedia.org/wiki/Just_for_You_(20...,Story of a young man who travels to Vietnam an...
34792,2017,Yongsoon,South_Korean,Shin Joon,"Lee Soo-kyung, Choi Deok-moon",unknown,https://en.wikipedia.org/wiki/Yongsoon,Story of a teenage girl who discovers young lo...
34795,2017,Write or Dance,South_Korean,Lee Sang-deok,"Choi Si-hyung, Jeon Yeo-been, Chae Seo-jin",unknown,https://en.wikipedia.org/wiki/Write_or_Dance,The story of an aspiring novelist who experien...
34799,2017,The Table,South_Korean,Kim Jong-kwan,"Im Soo-jung, Jung Yu-mil, Han Ye-ri, Jung Eun-...",unknown,https://en.wikipedia.org/wiki/The_Table_(2016_...,The film follows four conversations between tw...


### Looking at individual plots with 20 words or fewer

In [12]:
#plot text at position 25
movie_df['Plot'].iat[25]

'The story as told by Moving Picture World reads:\r\n'

In [13]:
#plot text at position 28
movie_df['Plot'].iat[28]

'Described as "a liberal adaptation of Mrs. Shelley\'s famous story", the plot description in the Edison Kinetogram was:[3]'

In [14]:
#plot text at position 83
movie_df['Plot'].iat[83]

'The following plot synopsis was published in conjunction with a 1915 showing of the film at Carnegie Hall:'

### Plots with 20 words or fewer also offer very little information for topic modelling, 
### so will drop all these rows... 

In [15]:
#flagging plots with 20 words or fewer, then dropping those rows by label
too_short = movie_df['Plot'].str.split().str.len() <= 20
movie_df.drop(index=movie_df.loc[too_short].index, inplace=True)
#resetting index (the previous row labels are kept in a new 'index' column)
movie_df.reset_index(inplace=True)

### Looking at plot lengths again, now that plots with 20 words or fewer have been dropped

In [16]:
#computing the per-plot character and word counts once and reusing them,
#rather than re-measuring and re-splitting every plot for each statistic
plot_chars = movie_df['Plot'].str.len()
plot_words = movie_df['Plot'].str.split().str.len()

#longest plot length - characters
max_plot_c = plot_chars.max()
#longest plot length - words 
max_plot_w = plot_words.max()

#shortest plot length - characters
min_plot_c = plot_chars.min()
#shortest plot length - words
min_plot_w = plot_words.min()

#mean plot length - characters
mean_plot_c = plot_chars.mean()
#mean plot length - words
mean_plot_w = plot_words.mean()

print("Max plot length - characters: ", max_plot_c)
print("Max plot length - words: ", max_plot_w)
print("Min plot length - characters: ", min_plot_c)
print("Min plot length - words: ", min_plot_w)
print("Mean plot length - characters: ", mean_plot_c)
print("Mean plot length - words: ", mean_plot_w)

Max plot length - characters:  36773
Max plot length - words:  6752
Min plot length - characters:  101
Min plot length - words:  21
Mean plot length - characters:  2206.516796772213
Mean plot length - words:  379.6329561734351


### 1.3 Looking for null values and duplicates

### Looking for null values

In [17]:
#number of missing values in each column
movie_df.isna().sum()

index                  0
Release Year           0
Title                  0
Origin/Ethnicity       0
Director               0
Cast                1374
Genre                  0
Wiki Page              0
Plot                   0
dtype: int64

#### Cast is the only column with null values, and it will be dropped, so the null values don't matter

### Looking for duplicate rows

In [18]:
#number of rows that are exact copies of another row (all columns compared)
movie_df.duplicated(keep=False).sum()

0

### Looking for duplicate titles 

In [19]:
#number of rows whose title also appears on at least one other row
movie_df['Title'].duplicated(keep=False).sum()

4383

In [20]:
#number of rows sharing each title
movie_df['Title'].value_counts()

Cinderella                    8
The Three Musketeers          8
Treasure Island               7
Alice in Wonderland           6
Hero                          6
                             ..
Priyamaina Neeku              1
Disorderly Conduct            1
Breakaway                     1
The Words                     1
Can a Song Save Your Life?    1
Name: Title, Length: 31830, dtype: int64

In [21]:
#number of rows sharing both title and release year with at least one other row
movie_df[['Title', 'Release Year']].duplicated(keep=False).sum()

539

#### Looking for duplicate plots 

In [22]:
#number of rows whose plot text also appears on at least one other row
movie_df['Plot'].duplicated(keep=False).sum()

1960

### Looking for rows with the same title, release year, and plot

In [23]:
#number of rows sharing title, release year and plot with at least one other row
movie_df[['Title', 'Release Year', 'Plot']].duplicated(keep=False).sum()

464

#### There are more rows with the same plots than rows with the same titles AND plots which suggests some plots have different titles 

In [24]:
#number of distinct values in each column
movie_df.nunique()

index               34203
Release Year          117
Title               31830
Origin/Ethnicity       24
Director            12383
Cast                31576
Genre                2242
Wiki Page           33394
Plot                33194
dtype: int64

### Going to drop rows where title and release year are the same, keeping only one of the movies

In [25]:
#a movie is identified by title + release year; the last row of each duplicate group is kept
movie_key = ['Title', 'Release Year']
movie_df.drop_duplicates(subset=movie_key, keep='last', inplace=True)

#### Double checking that title + release year is now unique

In [26]:
#should now be 0: no title + release year pair appears on more than one row
movie_df[['Title', 'Release Year']].duplicated(keep=False).sum()

0

### Creating a title + release year column so that each movie has a unique identifier

In [27]:
#building the identifier as "<title> <release year>"
release_year_text = movie_df["Release Year"].astype(str)
movie_df["Title Year"] = movie_df["Title"] + " " + release_year_text

### Dropping redundant columns 
#### We only really need title, release year, and plot, but we will also keep origin/ethnicity and director just in case 

In [28]:
#dropping the columns that are not needed downstream
#('index' holds the old row labels added when the index was reset)
movie_df.drop(columns=['index', 'Cast', 'Genre', 'Wiki Page'], inplace=True)

#### Saving the df just in case

In [29]:
#checkpointing the deduplicated dataframe (row labels are not written)
movie_df.to_csv(path_or_buf='movie_1.csv', index=False)

## 2. Preliminary Cleaning/Preprocessing
(Removing stop words and tokenizing can be done within the TF-IDF vectorizer, lemmatizing will be done after that)

### 2.1 Remove "\r\n" from plots 

In [30]:
#Series.replace only swaps cells whose ENTIRE value equals '\r\n', so it left the plots untouched;
#.str.replace works on substrings. A space is used as the replacement so that the words on either
#side of a line break are not glued together.
movie_df['Plot'] = movie_df['Plot'].str.replace('\r\n', ' ', regex=False)

###  2.2 Remove numbers and punctuation 

In [31]:
#raw-string pattern (the non-raw '\w' / '\d' escapes raise a warning on recent Python versions):
#a run of word characters that contains at least one digit
_DIGIT_TOKEN = re.compile(r'\w*\d\w*')
#any single character from string.punctuation
_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))


def alphanumeric(x):
    """Replace every token of x that contains a digit (e.g. '61', '1915') with a single space."""
    return _DIGIT_TOKEN.sub(' ', x) #remove numbers


def punc(x):
    """Replace every punctuation character of x with a single space."""
    return _PUNCTUATION.sub(' ', x) #remove punctuation

#stripping digit-bearing tokens first, then punctuation, from every plot
movie_df['Plot'] = movie_df['Plot'].map(lambda plot: punc(alphanumeric(plot)))

### 2.3 Tokenize words

In [32]:
#splitting every cleaned plot into a list of word tokens
movie_df['Plot tokenized'] = movie_df['Plot'].map(word_tokenize)

In [33]:
#tokens of the plot in the row labelled 0
movie_df.loc[0, 'Plot tokenized']

['A',
 'bartender',
 'is',
 'working',
 'at',
 'a',
 'saloon',
 'serving',
 'drinks',
 'to',
 'customers',
 'After',
 'he',
 'fills',
 'a',
 'stereotypically',
 'Irish',
 'man',
 's',
 'bucket',
 'with',
 'beer',
 'Carrie',
 'Nation',
 'and',
 'her',
 'followers',
 'burst',
 'inside',
 'They',
 'assault',
 'the',
 'Irish',
 'man',
 'pulling',
 'his',
 'hat',
 'over',
 'his',
 'eyes',
 'and',
 'then',
 'dumping',
 'the',
 'beer',
 'over',
 'his',
 'head',
 'The',
 'group',
 'then',
 'begin',
 'wrecking',
 'the',
 'bar',
 'smashing',
 'the',
 'fixtures',
 'mirrors',
 'and',
 'breaking',
 'the',
 'cash',
 'register',
 'The',
 'bartender',
 'then',
 'sprays',
 'seltzer',
 'water',
 'in',
 'Nation',
 's',
 'face',
 'before',
 'a',
 'group',
 'of',
 'policemen',
 'appear',
 'and',
 'order',
 'everybody',
 'to',
 'leave']

### 2.3 Remove proper nouns and lemmatize remaining words 

In [34]:
#tagging words with parts of speech, one token list per plot
tokenized_plots = movie_df['Plot tokenized'].tolist()
movie_df['Plot tagged'] = pos_tag_sents(tokenized_plots)

In [35]:
#inspecting the (word, tag) pairs of the plot in the row labelled 0
movie_df.loc[0, 'Plot tagged']

[('A', 'DT'),
 ('bartender', 'NN'),
 ('is', 'VBZ'),
 ('working', 'VBG'),
 ('at', 'IN'),
 ('a', 'DT'),
 ('saloon', 'NN'),
 ('serving', 'VBG'),
 ('drinks', 'NNS'),
 ('to', 'TO'),
 ('customers', 'NNS'),
 ('After', 'IN'),
 ('he', 'PRP'),
 ('fills', 'VBZ'),
 ('a', 'DT'),
 ('stereotypically', 'RB'),
 ('Irish', 'JJ'),
 ('man', 'NN'),
 ('s', 'VBZ'),
 ('bucket', 'NN'),
 ('with', 'IN'),
 ('beer', 'NN'),
 ('Carrie', 'NNP'),
 ('Nation', 'NNP'),
 ('and', 'CC'),
 ('her', 'PRP$'),
 ('followers', 'NNS'),
 ('burst', 'VBP'),
 ('inside', 'IN'),
 ('They', 'PRP'),
 ('assault', 'VBP'),
 ('the', 'DT'),
 ('Irish', 'NNP'),
 ('man', 'NN'),
 ('pulling', 'VBG'),
 ('his', 'PRP$'),
 ('hat', 'NN'),
 ('over', 'IN'),
 ('his', 'PRP$'),
 ('eyes', 'NNS'),
 ('and', 'CC'),
 ('then', 'RB'),
 ('dumping', 'VBG'),
 ('the', 'DT'),
 ('beer', 'NN'),
 ('over', 'IN'),
 ('his', 'PRP$'),
 ('head', 'NN'),
 ('The', 'DT'),
 ('group', 'NN'),
 ('then', 'RB'),
 ('begin', 'VBZ'),
 ('wrecking', 'VBG'),
 ('the', 'DT'),
 ('bar', 'NN'),
 ('smas

### Removing proper nouns 

In [36]:
#helper that filters proper nouns out of a tagged document
def remove_pn(tagged_text):
    """Return the (word, tag) pairs of tagged_text whose tag is neither 'NNP' nor 'NNPS'.

    The tags are kept in the result because they are needed for lemmatizing.
    """
    proper_noun_tags = ('NNP', 'NNPS')
    kept = []
    for word, tag in tagged_text:
        if tag not in proper_noun_tags:
            kept.append((word, tag))
    return kept

In [37]:
#filtering the proper nouns out of every tagged plot
movie_df['Plot tagged no pn'] = movie_df['Plot tagged'].map(remove_pn)

### Lemmatizing

In [48]:
#function that maps each POS tag to a WordNet part of speech and lemmatizes the word with it
def lemmatize_all(tagged_text):
    """Lemmatize a POS-tagged document.

    tagged_text: iterable of (word, tag) pairs.
    Words whose tag starts with 'NN', 'VB' or 'JJ' are lemmatized as nouns, verbs or
    adjectives respectively; every other word is passed through unchanged.
    Returns the resulting list of words (tags are dropped).
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token, treebank_tag in tagged_text:
        if treebank_tag.startswith("NN"):
            wordnet_pos = wordnet.NOUN
        elif treebank_tag.startswith('VB'):
            wordnet_pos = wordnet.VERB
        elif treebank_tag.startswith('JJ'):
            wordnet_pos = wordnet.ADJ
        else:
            #no WordNet part of speech is used for this tag: keep the word as it is
            lemmas.append(token)
            continue
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

In [49]:
#lemmatizing every plot that has had its proper nouns removed
movie_df['Plot lemm'] = movie_df['Plot tagged no pn'].map(lemmatize_all)

In [50]:
#inspecting one document to see if the proper nouns (and tags) have been removed
movie_df.loc[0, 'Plot lemm']

['A',
 'bartender',
 'be',
 'work',
 'at',
 'a',
 'saloon',
 'serve',
 'drink',
 'to',
 'customer',
 'After',
 'he',
 'fill',
 'a',
 'stereotypically',
 'Irish',
 'man',
 's',
 'bucket',
 'with',
 'beer',
 'and',
 'her',
 'follower',
 'burst',
 'inside',
 'They',
 'assault',
 'the',
 'man',
 'pull',
 'his',
 'hat',
 'over',
 'his',
 'eye',
 'and',
 'then',
 'dump',
 'the',
 'beer',
 'over',
 'his',
 'head',
 'The',
 'group',
 'then',
 'begin',
 'wreck',
 'the',
 'bar',
 'smash',
 'the',
 'fixture',
 'mirror',
 'and',
 'break',
 'the',
 'cash',
 'register',
 'The',
 'bartender',
 'then',
 'spray',
 'seltzer',
 'water',
 'in',
 's',
 'face',
 'before',
 'a',
 'group',
 'of',
 'policeman',
 'appear',
 'and',
 'order',
 'everybody',
 'to',
 'leave']

### 2.4 Final cleaning before modeling

In [51]:
#previewing the first 5 rows with all the intermediate text columns
movie_df.head(n=5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Plot,Title Year,Plot tokenized,Plot tagged,Plot tagged no pn,Plot lemm
0,1901,Kansas Saloon Smashers,American,Unknown,A bartender is working at a saloon serving dr...,Kansas Saloon Smashers 1901,"[A, bartender, is, working, at, a, saloon, ser...","[(A, DT), (bartender, NN), (is, VBZ), (working...","[(A, DT), (bartender, NN), (is, VBZ), (working...","[A, bartender, be, work, at, a, saloon, serve,..."
1,1901,Love by the Light of the Moon,American,Unknown,The moon painted with a smiling face hangs ov...,Love by the Light of the Moon 1901,"[The, moon, painted, with, a, smiling, face, h...","[(The, DT), (moon, NN), (painted, VBD), (with,...","[(The, DT), (moon, NN), (painted, VBD), (with,...","[The, moon, paint, with, a, smile, face, hang,..."
2,1901,The Martyred Presidents,American,Unknown,The film just over a minute long is composed...,The Martyred Presidents 1901,"[The, film, just, over, a, minute, long, is, c...","[(The, DT), (film, NN), (just, RB), (over, IN)...","[(The, DT), (film, NN), (just, RB), (over, IN)...","[The, film, just, over, a, minute, long, be, c..."
3,1901,"Terrible Teddy, the Grizzly King",American,Unknown,Lasting just seconds and consisting of two s...,"Terrible Teddy, the Grizzly King 1901","[Lasting, just, seconds, and, consisting, of, ...","[(Lasting, VBG), (just, RB), (seconds, NNS), (...","[(Lasting, VBG), (just, RB), (seconds, NNS), (...","[Lasting, just, second, and, consist, of, two,..."
4,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter",The earliest known adaptation of the classic f...,Jack and the Beanstalk 1902,"[The, earliest, known, adaptation, of, the, cl...","[(The, DT), (earliest, JJS), (known, JJ), (ada...","[(The, DT), (earliest, JJS), (known, JJ), (ada...","[The, early, known, adaptation, of, the, class..."


In [52]:
#checking if the title year column is unique (it is when every count shown is 1)
movie_df['Title Year'].value_counts()

Fun Down There 1988                   1
Johnny Stool Pigeon 1949              1
Alex Pandian 2013                     1
Hell on Frisco Bay 1955               1
The Strange World of Planet X 1957    1
                                     ..
 Kicking & Screaming 2005             1
Umbartha 1982                         1
The Raven 1963                        1
A Circle of Deception 1960            1
Bruce Lee and I 1976                  1
Name: Title Year, Length: 33931, dtype: int64

In [53]:
#checkpointing the dataframe with all intermediate text columns (row labels are not written)
movie_df.to_csv(path_or_buf='movie_2.csv', index=False)

In [54]:
#dropping the intermediate text columns, leaving only the lemmatized plot
movie_df.drop(columns=['Plot', 'Plot tokenized', 'Plot tagged', 'Plot tagged no pn'], inplace=True)

In [55]:
#the lemmatized plot is the column used for modeling, so naming it accordingly
movie_df.rename({'Plot lemm': 'Plot modeling'}, axis='columns', inplace=True)

In [56]:
#taking the Title Year column out and re-inserting it at position 0
first_column = movie_df.pop('Title Year')
movie_df.insert(loc=0, column='Title Year', value=first_column)

In [5]:
#looking at origin/ethnicity column: number of movies per origin
movie_df['Origin/Ethnicity'].value_counts()

American        17020
British          3478
Bollywood        2827
Tamil            2541
Telugu           1281
Japanese         1160
Malayalam        1070
Hong Kong         784
Canadian          716
Australian        543
South Korean      500
Chinese           434
Kannada           412
Bengali           293
Russian           229
Marathi           136
Filipino          128
Punjabi            82
Bangladeshi        80
Malaysian          70
Turkish            69
Egyptian           67
Assamese            9
Maldivian           2
Name: Origin/Ethnicity, dtype: int64

In [None]:
#normalising the 'South_Korean' label to 'South Korean' (space-separated, like 'Hong Kong').
#The result is assigned back to the column: calling replace(..., inplace=True) on a selected
#column is chained assignment, which does not update movie_df under pandas Copy-on-Write.
movie_df['Origin/Ethnicity'] = movie_df['Origin/Ethnicity'].replace({'South_Korean': 'South Korean'})

In [57]:
#inspecting one last time...
movie_df.head(n=5)

Unnamed: 0,Title Year,Release Year,Title,Origin/Ethnicity,Director,Plot modeling
0,Kansas Saloon Smashers 1901,1901,Kansas Saloon Smashers,American,Unknown,"[A, bartender, be, work, at, a, saloon, serve,..."
1,Love by the Light of the Moon 1901,1901,Love by the Light of the Moon,American,Unknown,"[The, moon, paint, with, a, smile, face, hang,..."
2,The Martyred Presidents 1901,1901,The Martyred Presidents,American,Unknown,"[The, film, just, over, a, minute, long, be, c..."
3,"Terrible Teddy, the Grizzly King 1901",1901,"Terrible Teddy, the Grizzly King",American,Unknown,"[Lasting, just, second, and, consist, of, two,..."
4,Jack and the Beanstalk 1902,1902,Jack and the Beanstalk,American,"George S. Fleming, Edwin S. Porter","[The, early, known, adaptation, of, the, class..."


### Saving as CSV for modeling

In [6]:
#writing the modeling dataframe without the row labels
#(note: 'Plot modeling' holds Python lists, which are written as their string representation)
movie_df.to_csv(path_or_buf='movie_model.csv', index=False)