This is a separate file to work on my word frequency of top movie synopses.  
Desired Result: The end goal is an application that estimates how closely a movie script summary matches the key words of successful movies.  In other words, it attempts to apply the "magic movie formula" using text analysis.
Challenges: This CSV file does not contain financial data or other measures of "success", nor does it contain data fields that are easily mappable to the other databases.  

In [47]:
# importing packages
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sqlite3
import os
import zlib
import zipfile
from zipfile import ZipFile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from pathlib import Path  
import glob
from matplotlib import font_manager

In [48]:
#This is a function to find the number of missing values per column in a dataframe
def missing_values(df_name):
    """Print the NaN count of every column in ``df_name``.

    A header line is written first, followed by one line per column in the
    dataframe's own column order.  The function only prints; it returns None.
    """
    print("Missing values by Column")
    for column_label in df_name.columns:
        nan_total = df_name[column_label].isna().sum()
        # print() applies str() to each piece, so sep="" reproduces plain concatenation.
        print(column_label, ": ", nan_total, " missing values", sep="")

In [49]:
#3) CSV - MOVIE INFO

# This is the tab-separated file with movie information with columns: id, synopsis, rating, genre, director, writer, theater_date, dvd_date, currency, box_office, runtime, studio.
# ID has type int64, all others are mixed type objects and may need cleaning.
movie_info = pd.read_csv('/home/bringingthesparkle/FlatIron/Movie_Data/rt.movie_info.tsv', sep='\t')
print(movie_info.head(2))

#print(movie_info.dtypes)
#print(movie_info.shape) # Shape is (1560 by 12)

# 'id' is used as the record key, so it must have no missing values and no duplicates.
assert movie_info["id"].isna().sum() == 0
assert movie_info["id"].is_unique

# Using missing values function to count NaNs per column
missing_values(movie_info)

   id   
0   1  \
1   3   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            

In [53]:
# Replace NaNs with the word 'None' and assert that no synopsis is left missing.
# NOTE: the replacement is applied to every column of movie_info, not only 'synopsis'.
# A single short word is used instead of a longer phrase because the scoring below relies on word counts.
movie_info.replace(np.nan, 'None', inplace=True)
assert movie_info['synopsis'].isna().sum() == 0

# Allow user to input a movie synopsis
#test_set = [input("Please enter the movie synopsis you would like to analyze:")]
test_set = ['When a young woman is killed by a shark while skinny-dipping near the New England tourist town of Amity Island, police chief Martin Brody (Roy Scheider) wants to close the beaches, but mayor Larry Vaughn (Murray Hamilton) overrules him, fearing that the loss of tourist revenue will cripple the town. Ichthyologist Matt Hooper (Richard Dreyfuss) and grizzled ship captain Quint (Robert Shaw) offer to help Brody capture the killer beast, and the trio engage in an epic battle of man vs. nature.']
#test_set = ['Arcade-game character Wreck-It Ralph (John C. Reilly) is tired of always being the "bad guy" and losing to his "good guy" opponent, Fix-It Felix (Jack McBrayer). Finally, after decades of seeing all the glory go to Felix, Ralph decides to take matters into his own hands. He sets off on a game-hopping trip to prove that he has what it takes to be a hero. However, while on his quest, Ralph accidentally unleashes a deadly enemy that threatens the entire arcade.']

# Score every stored synopsis against test_set.
# Raw score of a movie = number of word occurrences in test_set whose word appears
# at least once in that movie's synopsis.  The vectorizer is fitted once on all
# synopses instead of once per row; the resulting scores are the same.
vectorizer = CountVectorizer()
synopsis_counts = vectorizer.fit_transform(movie_info['synopsis'])
# Only the presence of a word in a stored synopsis matters, not how often it occurs there.
synopsis_presence = (synopsis_counts > 0).astype(np.int64)
# Occurrences of each vocabulary word in the text(s) being analysed.
test_counts = vectorizer.transform(test_set)
# (movies x words) @ (words x test texts), summed over the test texts -> one score per movie.
raw_scores = np.asarray((synopsis_presence @ test_counts.T).sum(axis=1)).ravel()

# Map positional row index in movie_info -> raw score.
counts = dict(enumerate(raw_scores))


# Turn k,v pair dict into a df then sort, allow user to iterate through as many possible matches as they want.

new_df = pd.DataFrame(list(counts.items()), columns=['Movie_Info_Index', 'Raw_Score'])
new_df.sort_values(by='Raw_Score', ascending=False, inplace=True)
new_df.reset_index(drop=True, inplace=True)
print(new_df.head())


iterator = 0
cont = 'y'
# Stop when the user declines or when every row has been shown.
while cont == 'y' and iterator < len(new_df):
    mi_index = new_df['Movie_Info_Index'].iloc[iterator]
    print(mi_index)
    # Remove the column-width limit so long text is not truncated when displayed
    pd.set_option('display.max_colwidth', None)
    # Movie_Info_Index holds positional row numbers, so look the row up by position.
    top_syn_match = movie_info['synopsis'].iloc[mi_index]
    print(top_syn_match)
    detail = input('Would like to see additional details about this match (y/n): ')
    if detail == 'y':
        # Reset the maxcolwidth to default
        pd.set_option('display.max_colwidth', 50)
        print(movie_info.iloc[[mi_index]].T)
    cont = input('Would you like to see another possible match (y/n):')
    iterator += 1
    #cont = 'n'

if cont == 'y':
    print('There are no more matches to show.')


   Movie_Info_Index  Raw_Score
0              1471         44
1                58         30
2               804         29
3               497         29
4              1408         28
1471
                                                           1471
id                                                         1892
synopsis      Based on Peter Benchley's best-selling novel, ...
rating                                                       PG
genre           Action and Adventure|Drama|Mystery and Suspense
director                                       Steven Spielberg
writer              Howard Sackler|Carl Gottlieb|Peter Benchley
theater_date                                        Jun 1, 1975
dvd_date                                           Jul 11, 2000
currency                                                   None
box_office                                                 None
runtime                                             124 minutes
studio                                   

In [5]:
# Reset the maxcolwidth to default
pd.set_option('display.max_colwidth', 50)

# The number entered is used as a positional row index into movie_info
# (the Movie_Info_Index value printed for a match).
raw_number = input("Please enter the number of the match you would like to view")
try:
    number = int(raw_number)
except ValueError:
    # Non-numeric input: report it instead of ending the cell with a traceback.
    print(f"'{raw_number}' is not a whole number.")
else:
    # iloc accepts negative positions too, so allow the same range it does.
    if -len(movie_info) <= number < len(movie_info):
        print(movie_info.iloc[number])
    else:
        print(f"No row at position {number}; movie_info has {len(movie_info)} rows.")

id                                                            648
synopsis        Set in Texas during the late 1860s, Rio Bravo ...
rating                                                          R
genre                       Action and Adventure|Classics|Western
director                                             Howard Hawks
writer                              Jules Furthman|Leigh Brackett
theater_date                                          Apr 4, 1959
dvd_date                                              May 8, 2001
currency                                                     None
box_office                                                   None
runtime                                               142 minutes
studio                                                       None
Name: 497, dtype: object


In [6]:
# Removed code sections.
# Earlier experiments are kept for reference inside one triple-quoted string
# literal, so none of the code below is executed.  The string is a bare
# expression at the end of the cell.
"""
mini_df = movie_info.iloc[3:5]
# Get the Pandas Series of the Synopsis Column
syns = mini_df['synopsis'] 
print(type(syns))
print(syns)
success_set = syns
count_vectorizer = CountVectorizer()
count_vectorizer.fit_transform(syns)
print ("Vocabulary:")
print(count_vectorizer.vocabulary_)
#Get just the list of words for matching to array columns later
Vocab = list(count_vectorizer.vocabulary_)
print(Vocab)
# Turn the word count array into an array
freq_term_matrix = count_vectorizer.transform(test_set)
print(type(freq_term_matrix.todense()))
print (freq_term_matrix.todense())

count_array = freq_term_matrix.toarray()
df = pd.DataFrame(data=count_array, columns=Vocab)
print(df)

#find the row with the highest "score", indicating it may be "like" the other movie.
print(np.sum(freq_term_matrix)) # This gives an overall score 

-------
#get the key (index) with the highest value
#print(counts)
#v = list(counts.values())
#print(v)
#k = list(counts.keys())
#print(k)
#max_val = k[v.index(max(v))]
#print(max_val)
--------------------

l = list(v)
l.sort(reverse=True)
print(l)
top_match_val = l[:3]
print((top_match_val))
top_match_key = []
for val in top_match_val: 
    top_match_key.append(k[v.index(val)])
#print (top_match_key)
# I had to change the col_widths to make the text viewable.
pd.set_option('display.max_colwidth', None)
print ('These are the top three matches')
for key_id in top_match_key:
    suggested = movie_info['synopsis'].iloc[[key_id]]
    print(suggested)

----------

#from sklearn.feature_extraction.text import TfidfTransformer
#tfidf = TfidfTransformer(norm="l2")
#tfidf.fit(freq_term_matrix)
#print ("IDF:")
#print(tfidf.idf_)"""

'\nmini_df = movie_info.iloc[3:5]\n# Get the Pandas Series of the Synopsis Column\nsyns = mini_df[\'synopsis\'] \nprint(type(syns))\nprint(syns)\nsuccess_set = syns\ncount_vectorizer = CountVectorizer()\ncount_vectorizer.fit_transform(syns)\nprint ("Vocabulary:")\nprint(count_vectorizer.vocabulary_)\n#Get just the list of words for matching to array columns later\nVocab = list(count_vectorizer.vocabulary_)\nprint(Vocab)\n# Turn the word count array into an array\nfreq_term_matrix = count_vectorizer.transform(test_set)\nprint(type(freq_term_matrix.todense()))\nprint (freq_term_matrix.todense())\n\ncount_array = freq_term_matrix.toarray()\ndf = pd.DataFrame(data=count_array, columns=Vocab)\nprint(df)\n\n#find the row with the highest "score", indicating it may be "like" the other movie.\nprint(np.sum(freq_term_matrix)) # This gives an overall score \n\n\n\n#from sklearn.feature_extraction.text import TfidfTransformer\n#tfidf = TfidfTransformer(norm="l2")\n#tfidf.fit(freq_term_matrix)\n#p