In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_excel("C:/Users/bibek/Desktop/movie_metadata.xls", 'movie_metadata', index_col=None, na_values=['NA'])
df.dropna(axis=0, inplace=True)

In [3]:
df['profit']=df['gross']- df['budget']
df["genres"] = df["genres"].str.replace('|', ', ')

In [4]:
Y = df.profit

In [5]:
X = df.drop(['profit', 'gross', 'num_critic_for_reviews', 'num_user_for_reviews', 'movie_movie_link'], 1)


In [6]:
X.head(5)
X.isnull().sum()

color                        0
director_name                0
duration                     0
director_facebook_likes      0
actor_3_facebook_likes       0
actor_2_name                 0
actor_1_facebook_likes       0
genres                       0
actor_1_name                 0
movie_title                  0
num_voted_users              0
cast_total_facebook_likes    0
actor_3_name                 0
facenumber_in_poster         0
plot_keywords                0
language                     0
country                      0
content_rating               0
budget                       0
title_year                   0
actor_2_facebook_likes       0
movie_score                  0
aspect_ratio                 0
movie_facebook_likes         0
dtype: int64

In [7]:
for col_name in X.columns:
    if X[col_name].dtype == 'object':
        unique_cat = len(X[col_name].unique())
        print ("featurename '{col_name}' has {unique_cat} unique categories".format(
            col_name =col_name, unique_cat = unique_cat))

featurename 'color' has 2 unique categories
featurename 'director_name' has 1659 unique categories
featurename 'actor_2_name' has 2188 unique categories
featurename 'genres' has 745 unique categories
featurename 'actor_1_name' has 1428 unique categories
featurename 'movie_title' has 3655 unique categories
featurename 'actor_3_name' has 2587 unique categories
featurename 'plot_keywords' has 3656 unique categories
featurename 'language' has 34 unique categories
featurename 'country' has 45 unique categories
featurename 'content_rating' has 12 unique categories


In [8]:
todummy_list = ['color', 'director_name', 'actor_2_name', 'genres', 'actor_1_name', 'movie_title', 'actor_3_name', 'plot_keywords', 'language', 'country', 'content_rating']

In [9]:
def dummy_df(df, todummy_list):
    for x in todummy_list:
        dummies= pd.get_dummies(df[x], prefix=x)
        df = df.drop(x, 1)
        df = pd.concat([df, dummies], axis=1)
    return df

In [10]:
X = dummy_df(X, todummy_list)
print(X.head(5))

   duration  director_facebook_likes  actor_3_facebook_likes  \
0     178.0                      0.0                   855.0   
1     169.0                    563.0                  1000.0   
2     148.0                      0.0                   161.0   
3     164.0                  22000.0                 23000.0   
5     132.0                    475.0                   530.0   

   actor_1_facebook_likes  num_voted_users  cast_total_facebook_likes  \
0                  1000.0           886204                       4834   
1                 40000.0           471220                      48350   
2                 11000.0           275868                      11700   
3                 27000.0          1144337                     106759   
5                   640.0           212204                       1873   

   facenumber_in_poster       budget  title_year  actor_2_facebook_likes  ...  \
0                   0.0  237000000.0      2009.0                   936.0  ...   
1             

#Outlier Detection

In [11]:
from sklearn.preprocessing import scale
from statsmodels.nonparametric.kde import KDEUnivariate
def find_outliers_kde(x):
    x_scaled = scale(list(map(float, x)))
    kde = KDEUnivariate(x_scaled)
    kde.fit(bw="scott",fft=True)
    pred = kde.evaluate(x_scaled)
    
    n= sum(pred < 0.05)
    outlier_ind = np.asarray(pred).argsort()[:n]
    outlier_value = np.asarray(x)[outlier_ind]
    return outlier_ind, outlier_value

In [12]:
kde_indices, kde_values = find_outliers_kde(X['facenumber_in_poster'])
print(np.sort(kde_values))

[ 6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.
  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.
  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.  6.
  6.  6.  6.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.
  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  7.  8.  8.  8.
  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.
  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  8.  9.  9.  9.  9.  9.  9.  9.
  9.  9.  9.  9. 10. 10. 10. 10. 10. 10. 11. 11. 11. 11. 11. 12. 12. 12.
 13. 14. 15. 15. 15. 15. 19. 31. 43.]


In [13]:
from itertools import combinations
from sklearn.preprocessing import PolynomialFeatures

In [14]:
def add_interactions(df):
    #Get feature names
    combos = list(combinations(list(df.columns),2))
    colnames =list(df.columns) + ['_'.join(x) for x in combos]
    
    #find interactions
    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    df = poly.fit_transformation(df)
    df = pd.DataFrame(df)
    df.columns = colnames
    
    #Remove interaction terms with all 0 values
    noint_indicies = [i for i, x in enumerate(list((df == 0).all())) if x]
    df = df.drop(df.columns[noint_indicies], axis=1)
    
    return df

In [15]:
X = add_interactions(X)

AttributeError: 'PolynomialFeatures' object has no attribute 'fit_transformation'

#Decomposition of features

In [16]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
X_pca = pd.DataFrame(pca.fit_transform(X))

In [17]:
print(X_pca.head(5))

             0             1             2             3             4   \
0  1.907632e+08  7.705416e+05 -43136.913399 -28956.290660   3516.728296   
1  2.537632e+08  3.539054e+05  29754.916856 -41368.148234   3904.178059   
2  1.987632e+08  1.662034e+05   3767.874067  62368.552144   7033.448961   
3  2.037632e+08  1.040981e+06  59241.498253  71440.000041 -34392.815652   
4  2.174632e+08  9.692002e+04 -15256.525050   8467.662695   1676.542882   

             5             6            7          8          9   ...  \
0  -5212.661423    109.101792   322.163163  32.876996 -14.335437  ...   
1  -2535.808542    545.538711   464.263912  41.124273 -11.898920  ...   
2  -1714.659592    965.387549  -190.265472  27.084746  -2.968569  ...   
3  13890.605187 -20932.025564  7702.693412 -22.116868  14.221902  ...   
4   -851.291368    239.623677  -330.543053  15.657214  -9.445633  ...   

         40        41        42        43        44        45        46  \
0 -0.081925 -0.106588 -0.027081 -0.

#Model building and feature selection 

In [18]:
Y = np.array([Y]).T 

from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.20, random_state=1)

In [19]:
import sklearn.feature_selection
select = sklearn.feature_selection.SelectKBest(k=20)
selected_features = select.fit(X_train, Y_train)
indices_selected = selected_features.get_support(indices=True)
colnames_selected = [X.columns[i] for i in indices_selected]

X_train_selected = X_train[colnames_selected]
X_test_selected =  X_test[colnames_selected]

  y = column_or_1d(y, warn=True)
  f = msb / msw
  f = msb / msw


In [20]:
colnames_selected

['country_Norway',
 'country_Official site',
 'country_Peru',
 'country_Romania',
 'country_Russia',
 'country_South Africa',
 'country_South Korea',
 'country_Spain',
 'country_Taiwan',
 'country_Thailand',
 'country_West Germany',
 'content_rating_G',
 'content_rating_GP',
 'content_rating_M',
 'content_rating_NC-17',
 'content_rating_Not Rated',
 'content_rating_PG-13',
 'content_rating_Passed',
 'content_rating_Unrated',
 'content_rating_X']