In [1]:
import pandas as pd
import json
from sklearn.preprocessing import MultiLabelBinarizer

#### Importing plot summaries data 

In [2]:
df_plot = pd.read_csv("plot_summaries.txt",delimiter="\t",header=None, names=["movie_id", "plot"])

In [3]:
df_plot.head()

Unnamed: 0,movie_id,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [6]:
#Checking the count of number of rows of data in plot summaries
df_plot.count()

movie_id    42303
plot        42303
dtype: int64

In [5]:
#checking if the movie_id column is unique in plot dataset (to be used to join with movie data)
df_plot['movie_id'].nunique()

42303

#### Importing Movie Data

In [6]:
col_names=["movie_id", "freebase_ID","name","release_date","revenue","runtime","languages","countries","genres"]

In [7]:
movie_data = pd.read_csv("movie.metadata.tsv",delimiter="\t",header=None, names=col_names)

In [8]:
movie_data.head()

Unnamed: 0,movie_id,freebase_ID,name,release_date,revenue,runtime,languages,countries,genres
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


In [9]:
#Subsetting the data to include only relevant columns
movies = movie_data[['movie_id', 'name','release_date','revenue','runtime','genres']]

In [10]:
movies.head()

Unnamed: 0,movie_id,name,release_date,revenue,runtime,genres
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,Brun bitter,1988,,83.0,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,White Of The Eye,1987,,110.0,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,A Woman in Flames,1983,,106.0,"{""/m/07s9rl0"": ""Drama""}"


In [11]:
#Checking the count of number of rows of data in movies data
movies.count()

movie_id        81741
name            81741
release_date    74839
revenue          8401
runtime         61291
genres          81741
dtype: int64

In [16]:
#checking if the movie_id column is unique in movie dataset (to be used to join with plot data)
movies['movie_id'].nunique()

81741

#### Convert genres from dictionary to list

In [12]:
#convering Genres from dictionary to a list 

genres = [] 

# extract genres
for i in movies['genres']: 
    genres.append(list(json.loads(i).values()))

In [20]:
len(genres)

81741

In [13]:
# add to 'movies' dataframe  
movies['genre'] = genres

movies.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,movie_id,name,release_date,revenue,runtime,genres,genre
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...","[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,Brun bitter,1988,,83.0,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...","[Crime Fiction, Drama]"
3,9363483,White Of The Eye,1987,,110.0,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...","[Thriller, Erotic thriller, Psychological thri..."
4,261236,A Woman in Flames,1983,,106.0,"{""/m/07s9rl0"": ""Drama""}",[Drama]


In [14]:
#remove genres column (genres dictionary)
movie=movies[['movie_id', 'name','release_date','revenue','runtime','genre']]

In [15]:
movie.head()

Unnamed: 0,movie_id,name,release_date,revenue,runtime,genre
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,Brun bitter,1988,,83.0,"[Crime Fiction, Drama]"
3,9363483,White Of The Eye,1987,,110.0,"[Thriller, Erotic thriller, Psychological thri..."
4,261236,A Woman in Flames,1983,,106.0,[Drama]


#### Map genres correctly

In [None]:

# stack list of sentences into different rows
# df_genre_list = pd.DataFrame.from_records(movie['genre'].tolist()).stack().reset_index(level=1, drop=True).rename('genre')
# df_movies = movie.drop('genre', axis=1).join(df_genre_list).reset_index(drop=True)[['art_id', 'Title','Year', 'art_sentences']]


In [38]:
# import genreMapping csv - prepared manually
df_genreMapping = pd.read_csv('GenreMapping.csv')
df_genreMapping.head()

Unnamed: 0,Genres,List
0,Thriller,Thriller
1,Science Fiction,"Science, Fiction"
2,Horror,Horror
3,Adventure,Adventure
4,Supernatural,Supernatural


In [42]:
def fn_convertToList(genres):
    genres = str(genres)
    list_of_genres = genres.split(',')
    #list_of_genres = [genre.strip() for genre in list_of_genres]
    return list_of_genres

In [44]:
df_genreMapping['NewList'] = df_genreMapping['List'].apply(fn_convertToList)
df_genreMapping = df_genreMapping.filter(['Genres','NewList'])
df_genreMapping.head()

Unnamed: 0,Genres,NewList
0,Thriller,[Thriller]
1,Science Fiction,"[Science, Fiction]"
2,Horror,[Horror]
3,Adventure,[Adventure]
4,Supernatural,[Supernatural]


In [45]:
movie.head()

Unnamed: 0,movie_id,name,release_date,revenue,runtime,genre
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Thriller, Science Fiction, Horror, Adventure,..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"[Mystery, Biographical film, Drama, Crime Drama]"
2,28463795,Brun bitter,1988,,83.0,"[Crime Fiction, Drama]"
3,9363483,White Of The Eye,1987,,110.0,"[Thriller, Erotic thriller, Psychological thri..."
4,261236,A Woman in Flames,1983,,106.0,[Drama]


In [79]:
# TRIAL BLOCK
new_list = []
for genre in test_genres:
    if genre in list(df_genreMapping['Genres']):
        new_list.append(df_genreMapping[df_genreMapping['Genres']==genre].iloc[0]['NewList'])
#new_list = fn_removeNestings(new_list)
flat_list = []
for sublist in new_list:
    for item in sublist:
        flat_list.append(item)
flat_list = [item.strip() for item in flat_list]
flat_list = set(flat_list)
flat_list = list(flat_list)
flat_list

['Erotica', 'Thriller', 'Psychological']

In [83]:
def fn_mapGenres(genre_list):
    new_list = []
    for genre in genre_list:
        if genre in list(df_genreMapping['Genres']):
            new_list.append(df_genreMapping[df_genreMapping['Genres']==genre].iloc[0]['NewList'])
    flat_list = []
    for sublist in new_list:
        for item in sublist:
            flat_list.append(item)
    flat_list = [item.strip() for item in flat_list]
    flat_list = set(flat_list) # removes duplicates
    flat_list = list(flat_list)
    return flat_list

In [84]:
movies['NewGenres'] = movies['genre'].apply(fn_mapGenres)
movies.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,movie_id,name,release_date,revenue,runtime,genres,genre,NewGenres
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...","[Thriller, Science Fiction, Horror, Adventure,...","[Supernatural, Science, Thriller, Action, Spac..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp...","[Mystery, Biographical film, Drama, Crime Drama]","[Drama, Mystery, Crime, Biographical Film]"
2,28463795,Brun bitter,1988,,83.0,"{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D...","[Crime Fiction, Drama]","[Drama, Crime, Fiction]"
3,9363483,White Of The Eye,1987,,110.0,"{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic...","[Thriller, Erotic thriller, Psychological thri...","[Erotica, Thriller, Psychological]"
4,261236,A Woman in Flames,1983,,106.0,"{""/m/07s9rl0"": ""Drama""}",[Drama],[Drama]


In [94]:
df_movies = movies.filter(['movie_id','name','release_date','revenue','runtime','NewGenres'])
#df_movies.to_csv('MovieGenres.csv')
df_movies.head()

Unnamed: 0,movie_id,name,release_date,revenue,runtime,NewGenres
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,"[Supernatural, Science, Thriller, Action, Spac..."
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"[Drama, Mystery, Crime, Biographical Film]"
2,28463795,Brun bitter,1988,,83.0,"[Drama, Crime, Fiction]"
3,9363483,White Of The Eye,1987,,110.0,"[Erotica, Thriller, Psychological]"
4,261236,A Woman in Flames,1983,,106.0,[Drama]


#### One Hot Encode the genres

In [95]:
# one hot encode the genres column
mlb = MultiLabelBinarizer()
df_movies = df_movies.join(pd.DataFrame(mlb.fit_transform(df_movies.pop('NewGenres')),
                          columns=mlb.classes_,
                          index=df_movies.index))
df_movies.head()

Unnamed: 0,movie_id,name,release_date,revenue,runtime,Absurdism,Action,Addiction,Adult,Adventure,...,War,War Film,Western,Whodunit,Women In Prison Films,World cinema,Wuxia,Z Movie,Zombie,nan
0,975900,Ghosts of Mars,2001-08-24,14010832.0,98.0,0,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
1,3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28463795,Brun bitter,1988,,83.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9363483,White Of The Eye,1987,,110.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,261236,A Woman in Flames,1983,,106.0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
df_movies.to_csv('OneHotEncodedGenres.csv')

#### Merging the Plot Summary Data with Movie Data

In [98]:
df_movie_plot_joined = pd.merge(left=df_movies,right=df_plot, left_on='movie_id', right_on='movie_id')

In [101]:
df_movie_plot_joined.to_csv('GenresPlotMerged.csv')