In [193]:
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import linear_model
from sklearn.preprocessing import StandardScaler

In [194]:
## importing the csv file

data = pd.read_csv('IMDB-Movie-Data.csv')


## cleaning null values

data = data[data['Revenue (Millions)'].notna()]
data = data[data['Metascore'].notna()]
data = data[data['Rating'].notna()]
data = data[data['Votes'].notna()]

## drop useless column

data.drop('Description', axis='columns', inplace=True)


## sampling the population

data_sample = data.sample(200)

data_sample.head()

Unnamed: 0,Rank,Title,Genre,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
856,857,Terminator Salvation,"Action,Adventure,Drama",McG,"Christian Bale, Sam Worthington, Anton Yelchin...",2009,115,6.6,297093,125.32,49.0
125,126,The Jungle Book,"Adventure,Drama,Family",Jon Favreau,"Neel Sethi, Bill Murray, Ben Kingsley, Idris Elba",2016,106,7.5,198243,364.0,77.0
373,374,Wanted,"Action,Crime,Fantasy",Timur Bekmambetov,"Angelina Jolie, James McAvoy, Morgan Freeman, ...",2008,110,6.7,312495,134.57,64.0
690,691,Resident Evil: Retribution,"Action,Horror,Sci-Fi",Paul W.S. Anderson,"Milla Jovovich, Sienna Guillory, Michelle Rodr...",2012,96,5.4,114144,42.35,39.0
748,749,Ouija: Origin of Evil,"Horror,Thriller",Mike Flanagan,"Elizabeth Reaser, Lulu Wilson, Annalise Basso,...",2016,99,6.1,30035,34.9,65.0


In [195]:
### transorming the rows Actors and Genre into colums

data_sample[['Actor_1', 'Actor_2', 'Actor_3', 'Actor_4']] = data_sample['Actors'].str.split(',', expand=True)

data_sample[['Genre_1', 'Genre_2', 'Genre_3', ]] = data_sample['Genre'].str.split(',', expand=True)

data_sample2 = data_sample[['Rank', 'Title', 'Genre_1', 'Genre_2', 'Genre_3', 'Actor_1', 'Actor_2', 'Actor_3', 'Actor_4', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)', 'Metascore' ]]

data_sample2.head()

Unnamed: 0,Rank,Title,Genre_1,Genre_2,Genre_3,Actor_1,Actor_2,Actor_3,Actor_4,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
856,857,Terminator Salvation,Action,Adventure,Drama,Christian Bale,Sam Worthington,Anton Yelchin,Moon Bloodgood,2009,115,6.6,297093,125.32,49.0
125,126,The Jungle Book,Adventure,Drama,Family,Neel Sethi,Bill Murray,Ben Kingsley,Idris Elba,2016,106,7.5,198243,364.0,77.0
373,374,Wanted,Action,Crime,Fantasy,Angelina Jolie,James McAvoy,Morgan Freeman,Terence Stamp,2008,110,6.7,312495,134.57,64.0
690,691,Resident Evil: Retribution,Action,Horror,Sci-Fi,Milla Jovovich,Sienna Guillory,Michelle Rodriguez,Aryana Engineer,2012,96,5.4,114144,42.35,39.0
748,749,Ouija: Origin of Evil,Horror,Thriller,,Elizabeth Reaser,Lulu Wilson,Annalise Basso,Henry Thomas,2016,99,6.1,30035,34.9,65.0


In [196]:
## melting for actors for a better data

verticalized = pd.melt(data_sample2, id_vars = ['Rank', 'Title', 'Genre_1', 'Genre_2', 'Genre_3', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)', 'Metascore'])

verticalized.head()

Unnamed: 0,Rank,Title,Genre_1,Genre_2,Genre_3,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,variable,value
0,857,Terminator Salvation,Action,Adventure,Drama,2009,115,6.6,297093,125.32,49.0,Actor_1,Christian Bale
1,126,The Jungle Book,Adventure,Drama,Family,2016,106,7.5,198243,364.0,77.0,Actor_1,Neel Sethi
2,374,Wanted,Action,Crime,Fantasy,2008,110,6.7,312495,134.57,64.0,Actor_1,Angelina Jolie
3,691,Resident Evil: Retribution,Action,Horror,Sci-Fi,2012,96,5.4,114144,42.35,39.0,Actor_1,Milla Jovovich
4,749,Ouija: Origin of Evil,Horror,Thriller,,2016,99,6.1,30035,34.9,65.0,Actor_1,Elizabeth Reaser


In [197]:
verticalized.drop('variable', axis='columns', inplace=True)

verticalized.rename(columns = {'value' : 'Actors'},  inplace=True)

verticalized.head()

Unnamed: 0,Rank,Title,Genre_1,Genre_2,Genre_3,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Actors
0,857,Terminator Salvation,Action,Adventure,Drama,2009,115,6.6,297093,125.32,49.0,Christian Bale
1,126,The Jungle Book,Adventure,Drama,Family,2016,106,7.5,198243,364.0,77.0,Neel Sethi
2,374,Wanted,Action,Crime,Fantasy,2008,110,6.7,312495,134.57,64.0,Angelina Jolie
3,691,Resident Evil: Retribution,Action,Horror,Sci-Fi,2012,96,5.4,114144,42.35,39.0,Milla Jovovich
4,749,Ouija: Origin of Evil,Horror,Thriller,,2016,99,6.1,30035,34.9,65.0,Elizabeth Reaser


In [198]:
## melting for genres for a better data

verticalized2 = pd.melt(verticalized, id_vars = ['Rank', 'Title', 'Year', 'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)', 'Metascore', 'Actors'])

verticalized2.head()

Unnamed: 0,Rank,Title,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Actors,variable,value
0,857,Terminator Salvation,2009,115,6.6,297093,125.32,49.0,Christian Bale,Genre_1,Action
1,126,The Jungle Book,2016,106,7.5,198243,364.0,77.0,Neel Sethi,Genre_1,Adventure
2,374,Wanted,2008,110,6.7,312495,134.57,64.0,Angelina Jolie,Genre_1,Action
3,691,Resident Evil: Retribution,2012,96,5.4,114144,42.35,39.0,Milla Jovovich,Genre_1,Action
4,749,Ouija: Origin of Evil,2016,99,6.1,30035,34.9,65.0,Elizabeth Reaser,Genre_1,Horror


In [199]:
verticalized2.drop('variable', axis='columns', inplace=True)

verticalized2.rename(columns = {'value' : 'Genre'},  inplace=True)

verticalized2.head()




Unnamed: 0,Rank,Title,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Actors,Genre
0,857,Terminator Salvation,2009,115,6.6,297093,125.32,49.0,Christian Bale,Action
1,126,The Jungle Book,2016,106,7.5,198243,364.0,77.0,Neel Sethi,Adventure
2,374,Wanted,2008,110,6.7,312495,134.57,64.0,Angelina Jolie,Action
3,691,Resident Evil: Retribution,2012,96,5.4,114144,42.35,39.0,Milla Jovovich,Action
4,749,Ouija: Origin of Evil,2016,99,6.1,30035,34.9,65.0,Elizabeth Reaser,Horror


In [200]:
verticalized2 = verticalized2[verticalized2['Revenue (Millions)'].notna()]
verticalized2 = verticalized2[verticalized2['Metascore'].notna()]
verticalized2 = verticalized2[verticalized2['Rating'].notna()]
verticalized2 = verticalized2[verticalized2['Votes'].notna()]
verticalized2 = verticalized2[verticalized2['Actors'].notna()]
verticalized2 = verticalized2[verticalized2['Genre'].notna()]
verticalized2 = verticalized2[verticalized2['Runtime (Minutes)'].notna()]

In [201]:
#verticalized2.to_csv('IMBD_project.csv')

#Adventure = verticalized2[verticalized2['Genre'] == 'Adventure']['Revenue (Millions)']
#Adventure.mean()

verticalized2['Genre'].unique()

array(['Action', 'Adventure', 'Horror', 'Animation', 'Comedy', 'Mystery',
       'Drama', 'Sci-Fi', 'Biography', 'Crime', 'Thriller', 'Fantasy',
       'Family', 'Romance', 'War', 'Music', 'Sport', 'History'],
      dtype=object)

# MULTINLINEAR REGRESSION

In [202]:


## checking correlation in between clomuns

print('The correlation between Revenue (Millions) and Rating:', verticalized2['Revenue (Millions)'].corr(verticalized2['Rating']))
print('The correlation between Revenue (Millions) and Metascore:' , verticalized2['Revenue (Millions)'].corr(verticalized2['Metascore']))
print('The correlation between Revenue (Millions) and Number of Votes:', verticalized2['Revenue (Millions)'].corr(verticalized2['Votes']))
print('The correlation between Revenue (Millions) and Rank:' , verticalized2['Revenue (Millions)'].corr(verticalized2['Rank']))

The correlation between Revenue (Millions) and Rating: 0.31149160298357415
The correlation between Revenue (Millions) and Metascore: 0.3095077325626471
The correlation between Revenue (Millions) and Number of Votes: 0.68712142452722
The correlation between Revenue (Millions) and Rank: -0.2965994389821227


In [203]:
# Multilinear regression:
## I want to see how our data are correlated : the revenue based on rating, metascore and runtime


X = data_sample2[['Rating','Metascore', 'Runtime (Minutes)', 'Rank']]
y = data_sample2['Revenue (Millions)']

model = linear_model.LinearRegression()
result = model.fit(X,y)

R2 = result.score(X,y)

print('R^2 (not normalized) is:', R2)

R^2 (not normalized) is: 0.19638647048360125


In [204]:
# Multilinear regression:
# Let's Try to normalize the datas:




import sklearn
from sklearn.preprocessing import MinMaxScaler

normalizer = MinMaxScaler()
normalized = normalizer.fit_transform(data_sample2[['Rating','Metascore']])#'Runtime (Minutes)','Rank']])



X_normalized =pd.DataFrame(normalized, columns=['Rating','Metascore'])#'Runtime (Minutes)','Rank'])




In [205]:

model = linear_model.LinearRegression()
result = model.fit(X_normalized,y)
R2_normalized = result.score(X_normalized,y)

print('R^2 (normalized) is:', R2_normalized)

R^2 (normalized) is: 0.10945993085875727


# HYPOTESIS TESTING

In [206]:
##### I wanto to Test following Hypotesis:
##### H0: mu[genre](Revenue (Millions)) = mu[genre](Revenue (Millions))
##### H1: mu[genre](Revenue (Millions)) != mu[genre](Revenue (Millions))
##### sl = 0.05


## I define a function that can calculate stats and p_value and test my H0:

def hypothesis_1 (genre1, genre2, sl= 0.05):
    sl = sl
    for genre in verticalized2['Genre']:
        type1 = verticalized2[verticalized2['Genre'] == genre1]['Revenue (Millions)']
        type2 = verticalized2[verticalized2['Genre'] == genre2]['Revenue (Millions)']
    stat, p_value = st.ttest_ind(type1, type2,  equal_var=False)
    return p_value < sl




In [207]:
### I created a loop to make confront all possible combination

dic = {}

for x in verticalized2['Genre'].unique():
    for y in verticalized2['Genre'].unique():
        if x == y:
            continue
        else:
            key = x + ' with ' + y
            dic[key] = hypothesis_1(x,y)

In [253]:
##### H0: mu[genre](Rating)) = mu[genre](Rating)
##### H1: mu[genre](Rating) != mu[genre](Rating)
##### sl = 0.05


def hypothesis_2 (genre1, genre2, sl= 0.05):
    sl = sl
    for genre in verticalized2['Genre']:
        type1 = verticalized2[verticalized2['Genre'] == genre1]['Rating']
        type2 = verticalized2[verticalized2['Genre'] == genre2]['Rating']
    stat, p_value = st.ttest_ind(type1, type2,  equal_var=False)
    return p_value < sl

In [254]:
dic2 = {}

for x in verticalized2['Genre'].unique():
    for y in verticalized2['Genre'].unique():
        if x == y:
            continue
        else:
            key = x + ' with ' + y
            dic2[key] = hypothesis_2(x,y)

In [250]:
##### H0: mu[genre](Metascore)) = mu[genre](Metascore)
##### H1: mu[genre](Metascore) != mu[genre](Metascore)
##### sl = 0.05


def hypothesis_3 (genre1, genre2, sl= 0.05):
    sl = sl
    for genre in verticalized2['Genre']:
        type1 = verticalized2[verticalized2['Genre'] == genre1]['Metascore']
        type2 = verticalized2[verticalized2['Genre'] == genre2]['Metascore']
    stat, p_value = st.ttest_ind(type1, type2,  equal_var=False)
    return p_value < sl

In [251]:
dic3 = {}

for x in verticalized2['Genre'].unique():
    for y in verticalized2['Genre'].unique():
        if x == y:
            continue
        else:
            key = x + ' with ' + y
            dic3[key] = hypothesis_3(x,y)

In [210]:
dic

{'Action with Adventure': True,
 'Action with Horror': True,
 'Action with Animation': False,
 'Action with Comedy': True,
 'Action with Mystery': True,
 'Action with Drama': True,
 'Action with Sci-Fi': False,
 'Action with Biography': True,
 'Action with Crime': True,
 'Action with Thriller': True,
 'Action with Fantasy': False,
 'Action with Family': False,
 'Action with Romance': True,
 'Action with War': True,
 'Action with Music': True,
 'Action with Sport': True,
 'Action with History': True,
 'Adventure with Action': True,
 'Adventure with Horror': True,
 'Adventure with Animation': False,
 'Adventure with Comedy': True,
 'Adventure with Mystery': True,
 'Adventure with Drama': True,
 'Adventure with Sci-Fi': False,
 'Adventure with Biography': True,
 'Adventure with Crime': True,
 'Adventure with Thriller': True,
 'Adventure with Fantasy': False,
 'Adventure with Family': True,
 'Adventure with Romance': True,
 'Adventure with War': True,
 'Adventure with Music': True,
 'Adven

In [255]:
dic2

{'Action with Adventure': True,
 'Action with Horror': True,
 'Action with Animation': True,
 'Action with Comedy': False,
 'Action with Mystery': False,
 'Action with Drama': True,
 'Action with Sci-Fi': False,
 'Action with Biography': True,
 'Action with Crime': True,
 'Action with Thriller': False,
 'Action with Fantasy': False,
 'Action with Family': False,
 'Action with Romance': False,
 'Action with War': True,
 'Action with Music': False,
 'Action with Sport': True,
 'Action with History': True,
 'Adventure with Action': True,
 'Adventure with Horror': True,
 'Adventure with Animation': True,
 'Adventure with Comedy': True,
 'Adventure with Mystery': False,
 'Adventure with Drama': False,
 'Adventure with Sci-Fi': False,
 'Adventure with Biography': True,
 'Adventure with Crime': False,
 'Adventure with Thriller': False,
 'Adventure with Fantasy': False,
 'Adventure with Family': False,
 'Adventure with Romance': True,
 'Adventure with War': False,
 'Adventure with Music': Fals

In [252]:
dic3

{'Action with Adventure': True,
 'Action with Horror': False,
 'Action with Animation': True,
 'Action with Comedy': False,
 'Action with Mystery': False,
 'Action with Drama': True,
 'Action with Sci-Fi': True,
 'Action with Biography': True,
 'Action with Crime': True,
 'Action with Thriller': True,
 'Action with Fantasy': True,
 'Action with Family': True,
 'Action with Romance': False,
 'Action with War': False,
 'Action with Music': True,
 'Action with Sport': False,
 'Action with History': True,
 'Adventure with Action': True,
 'Adventure with Horror': True,
 'Adventure with Animation': True,
 'Adventure with Comedy': True,
 'Adventure with Mystery': True,
 'Adventure with Drama': False,
 'Adventure with Sci-Fi': False,
 'Adventure with Biography': True,
 'Adventure with Crime': False,
 'Adventure with Thriller': False,
 'Adventure with Fantasy': False,
 'Adventure with Family': False,
 'Adventure with Romance': True,
 'Adventure with War': False,
 'Adventure with Music': True,
 

270

110

In [264]:
### create new dataframe for analysis

product1 = pd.DataFrame(list(dic.items()), columns=['Genre', 'Df_Revenue'])
product2 = pd.DataFrame(list(dic2.items()), columns=['Genre', 'Df_Rating'])
product3 = pd.DataFrame(list(dic3.items()), columns=['Genre', 'Df_Metascore'])

product1[['Genre_1', 'Genre_2']] = product1['Genre'].str.split(' with ', expand=True)

product2[['Genre_1', 'Genre_2']] = product2['Genre'].str.split(' with ', expand=True)

product3[['Genre_1', 'Genre_2']] = product3['Genre'].str.split(' with ', expand=True)


product1.drop('Genre', axis='columns', inplace=True)
product2.drop('Genre', axis='columns', inplace=True)
product3.drop('Genre', axis='columns', inplace=True)

product1 = product1[['Genre_1', 'Genre_2', 'Df_Revenue']]
product2 = product2[['Genre_1', 'Genre_2', 'Df_Rating']]
product3 = product3[['Genre_1', 'Genre_2', 'Df_Metascore']]
                    
display(product1)

display(product2)

display(product3)

Unnamed: 0,Genre_1,Genre_2,Df_Revenue
0,Action,Adventure,True
1,Action,Horror,True
2,Action,Animation,False
3,Action,Comedy,True
4,Action,Mystery,True
...,...,...,...
301,History,Family,True
302,History,Romance,False
303,History,War,False
304,History,Music,False


Unnamed: 0,Genre_1,Genre_2,Df_Rating
0,Action,Adventure,True
1,Action,Horror,True
2,Action,Animation,True
3,Action,Comedy,False
4,Action,Mystery,False
...,...,...,...
301,History,Family,True
302,History,Romance,True
303,History,War,False
304,History,Music,False


Unnamed: 0,Genre_1,Genre_2,Df_Metascore
0,Action,Adventure,True
1,Action,Horror,False
2,Action,Animation,True
3,Action,Comedy,False
4,Action,Mystery,False
...,...,...,...
301,History,Family,False
302,History,Romance,True
303,History,War,True
304,History,Music,True


In [221]:
#product2.drop('Genre_1', axis='columns', inplace=True)
#product2.drop('Genre_2', axis='columns', inplace=True)

In [265]:
product_ = pd.merge(product1, product2 ,how= 'inner')

product_

Unnamed: 0,Genre_1,Genre_2,Df_Revenue,Df_Rating
0,Action,Adventure,True,True
1,Action,Horror,True,True
2,Action,Animation,False,True
3,Action,Comedy,True,False
4,Action,Mystery,True,False
...,...,...,...,...
301,History,Family,True,True
302,History,Romance,False,True
303,History,War,False,False
304,History,Music,False,False


In [266]:
product = pd.merge(product_, product3 ,how= 'inner')

product

Unnamed: 0,Genre_1,Genre_2,Df_Revenue,Df_Rating,Df_Metascore
0,Action,Adventure,True,True,True
1,Action,Horror,True,True,False
2,Action,Animation,False,True,True
3,Action,Comedy,True,False,False
4,Action,Mystery,True,False,False
...,...,...,...,...,...
301,History,Family,True,True,False
302,History,Romance,False,True,True
303,History,War,False,False,True
304,History,Music,False,False,True


In [267]:
### creating new columns for mean

result = []


for i in product['Genre_1']:
    means = verticalized2[verticalized2['Genre'] == i]['Revenue (Millions)'].mean()
    result.append(means)

    
product['G1_Revenue'] = result

result2 = []

for i in product['Genre_2']:
    means2 = verticalized2[verticalized2['Genre'] == i]['Revenue (Millions)'].mean()
    result2.append(means2)
    
product['G2_Revenue'] = result2   







In [268]:
result = []


for i in product['Genre_1']:
    means = verticalized2[verticalized2['Genre'] == i]['Rating'].mean()
    result.append(means)

    
product['G1_Rating'] = result

result2 = []

for i in product['Genre_2']:
    means2 = verticalized2[verticalized2['Genre'] == i]['Rating'].mean()
    result2.append(means2)
    
product['G2_Rating'] = result2   



In [269]:
result = []


for i in product['Genre_1']:
    means = verticalized2[verticalized2['Genre'] == i]['Metascore'].mean()
    result.append(means)

    
product['G1_Metascore'] = result

result2 = []

for i in product['Genre_2']:
    means2 = verticalized2[verticalized2['Genre'] == i]['Metascore'].mean()
    result2.append(means2)
    
product['G2_Metascore'] = result2   

 

product

Unnamed: 0,Genre_1,Genre_2,Df_Revenue,Df_Rating,Df_Metascore,G1_Revenue,G2_Revenue,G1_Rating,G2_Rating,G1_Metascore,G2_Metascore
0,Action,Adventure,True,True,True,142.158406,188.915714,6.784058,7.051786,56.014493,61.803571
1,Action,Horror,True,True,False,142.158406,48.923889,6.784058,6.288889,56.014493,57.444444
2,Action,Animation,False,True,True,142.158406,172.068000,6.784058,7.710000,56.014493,76.900000
3,Action,Comedy,True,False,False,142.158406,72.888542,6.784058,6.772917,56.014493,58.562500
4,Action,Mystery,True,False,False,142.158406,81.683571,6.784058,6.864286,56.014493,55.642857
...,...,...,...,...,...,...,...,...,...,...,...
301,History,Family,True,True,False,32.812857,135.598182,7.442857,7.000000,68.571429,63.909091
302,History,Romance,False,True,True,32.812857,38.498621,7.442857,6.772414,68.571429,54.551724
303,History,War,False,False,True,32.812857,43.756667,7.442857,7.466667,68.571429,59.000000
304,History,Music,False,False,True,32.812857,54.972500,7.442857,7.175000,68.571429,47.250000


In [270]:
product.head()

Unnamed: 0,Genre_1,Genre_2,Df_Revenue,Df_Rating,Df_Metascore,G1_Revenue,G2_Revenue,G1_Rating,G2_Rating,G1_Metascore,G2_Metascore
0,Action,Adventure,True,True,True,142.158406,188.915714,6.784058,7.051786,56.014493,61.803571
1,Action,Horror,True,True,False,142.158406,48.923889,6.784058,6.288889,56.014493,57.444444
2,Action,Animation,False,True,True,142.158406,172.068,6.784058,7.71,56.014493,76.9
3,Action,Comedy,True,False,False,142.158406,72.888542,6.784058,6.772917,56.014493,58.5625
4,Action,Mystery,True,False,False,142.158406,81.683571,6.784058,6.864286,56.014493,55.642857


In [None]:
list_of_genre =['Action', 'Adventure', 'Horror', 'Animation', 'Comedy', 'Mystery',
       'Drama', 'Sci-Fi', 'Biography', 'Crime', 'Thriller', 'Fantasy',
       'Family', 'Romance', 'War', 'Music', 'Sport', 'History']



# 
#def function():
gen1 = input('genere1:')
gen2 = input('genere2:')
interest = input('what do you wanto to compare?:')

choice = product[(product['Genre_1'] == gen1) & (product['Genre_2'] == gen2)][['Genre_1', 'Genre_2','Df_'+interest,'G1_'+ interest, 'G2_'+ interest]]


3    166.072464
dtype: float64
