In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.impute import SimpleImputer

In [2]:
data = pd.read_csv(r'final_cleaned_preproecessed_file.csv')

In [3]:
overview = data.loc[:, ['id','overview']]
data = data.drop(['id', 'overview', 'IMDB Score','revenue', 'vote_average', 'vote_count','production_company_2','popularity',], axis = 1)

In [4]:
for i,j in enumerate(data.columns) : 
    print('{} : {}'.format(i,j))

0 : belongs_to_collection
1 : budget
2 : runtime
3 : num_faces
4 : brightness
5 : saturation
6 : hue
7 : brightness_sd
8 : saturation_sd 
9 : hue_sd
10 : blue
11 : blue_sd
12 : green
13 : green_sd
14 : red
15 : red_sd 
16 : Action
17 : Adventure
18 : Animation
19 : Comedy
20 : Crime
21 : Documentary
22 : Drama
23 : Family
24 : Fantasy
25 : Foreign
26 : History
27 : Horror
28 : Music
29 : Mystery
30 : Romance
31 : Science Fiction
32 : TV Movie
33 : Thriller
34 : War
35 : Western
36 : production_company_1
37 : Lead_Actor
38 : Supporting_Actor
39 : Director
40 : Producer
41 : rating


In [5]:
data[data.budget == 0].shape

(4637, 42)

In [6]:
data[data.runtime == 0].shape

(131, 42)

In [7]:
data[data.Lead_Actor == '0'].shape

(521, 42)

In [8]:
data[data.Supporting_Actor == '0'].shape

(392, 42)

In [9]:
data[data.Director == '0'].shape

(56, 42)

In [10]:
data[data.Producer == '0'].shape

(2506, 42)

### Imputing values for Numerical Variable - (runtime )

In [11]:
imputer_runtime = SimpleImputer(missing_values=0, strategy = 'mean')

data.loc[:,['runtime']] = imputer_runtime.fit_transform(data.loc[:,['runtime']])

### Imputing values for Categorical Variables - 
'production_company_1', 'production_company_2', 'Lead_Actor', 'Supporting_Actor', 'Director',
       'Producer'

In [12]:
imputer_categorical = SimpleImputer(missing_values='0', strategy='most_frequent')
cols = ['production_company_1', 'Lead_Actor', 'Supporting_Actor', 'Director','Producer']
data.loc[:,cols] =imputer_categorical.fit_transform(data.loc[:,cols])

### Categorical encoding - Target Encoding

In [13]:
import category_encoders as ce
from category_encoders import TargetEncoder

encoder_lead_actor = ce.TargetEncoder(cols= 'Lead_Actor') 
data.loc[:,'Lead_Actor'] = encoder_lead_actor.fit_transform(data['Lead_Actor'],data['rating'])

encoder_sup_actor = ce.TargetEncoder(cols= 'Supporting_Actor') 
data.loc[:,'Supporting_Actor'] = encoder_sup_actor.fit_transform(data['Supporting_Actor'],data['rating'])

encoder_director_name = ce.TargetEncoder(cols= 'Director') 
data.loc[:,'Director'] = encoder_director_name.fit_transform(data['Director'],data['rating'])

encoder_producer_name = ce.TargetEncoder(cols= 'Producer') 
data.loc[:,'Producer'] = encoder_producer_name.fit_transform(data['Producer'],data['rating'])

encoder_production_company_1 = ce.TargetEncoder(cols= 'production_company_1') 
data.loc[:,'production_company_1'] = encoder_production_company_1.fit_transform(data['production_company_1'],data['rating'])

### Imputing values for budget

In [14]:
train = data[data['budget'] != 0].copy()
test = data[data['budget'] == 0].copy()
X_train, X_test, y_train, y_test =  train.drop('budget',axis = 1), test.drop('budget',axis = 1), train.loc[:,'budget'], test.loc[:,'budget']
ind = y_test.index

from sklearn.neighbors import KNeighborsRegressor

knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

for i in range(ind.shape[0]):
    data.loc[ind[i],'budget'] = y_pred[i]
    

### Bag of Words Model with CountVectorizer

In [15]:
"""import re
import nltk
import pickle
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
def preprocess(df, lst) : 
    for i in range(df.shape[0]) :
        comment = re.sub(r'[?|!|\'|"|#]',r'', df['overview'][i])
        comment = re.sub(r'[.|,|)|(|\|/]',r' ', comment)
        comment = comment.replace("\n"," ")
        comment = re.sub('[^a-zA-Z]',' ', comment)
        comment = comment.lower()
        comment = comment.split()
        ps = SnowballStemmer(language='english')
        comment = [ps.stem(word) for word in comment if not word in set(stopwords.words('english'))]
        comment = ' '.join(comment)
        lst.append(comment)
        if i%1000 == 0 :
            print(str(i)+' comments preprocessed')

    
start1 = datetime.now()    
corpus = []
preprocess(overview, corpus)
end1 = datetime.now()
print('\n..............Corpus created.............')
print('\nTIme taken to create corpus : ', start1 - end1)

cv = CountVectorizer(max_features=500)

ovrvw = cv.fit_transform(corpus).toarray()

pd.DataFrame(ovrvw).to_csv('overview_cv.csv',index_label = False)
"""

ovrvw = pd.read_csv(r'overview_cv.csv')

In [16]:
data = data.drop(data.iloc[:,3:16].columns, axis = 1)

In [17]:
y = data.loc[:,'rating'].values
X = data.loc[:,:'rating'].values
print('-------------| X Shape : {} |-------------'.format(X.shape))

-------------| X Shape : (6634, 29) |-------------


In [18]:
X = np.hstack((X,ovrvw))

In [19]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

X = sc.fit_transform(X)

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [21]:
from sklearn.svm import SVR
# from sklearn.model_selection import GridSearchCV

start = datetime.now()

# param = {'alpha' : np.linspace(0.0001, 1, 100)}

# grids = GridSearchCV(Lasso(), param_grid = param, cv = 10,scoring = 'neg_mean_absolute_error', n_jobs = -1)

grids = SVR(kernel = 'linear')

grids.fit(X_train,y_train)

end = datetime.now()

print('-------------| Time to fit the model : {} |-------------'.format(start - end))

# print('-------------| Best Paramters : {} |-------------'.format(grids.best_params_))

-------------| Time to fit the model : -1 day, 23:59:55.168926 |-------------


In [22]:
y_train_pred = grids.predict(X_train)

from sklearn.metrics import mean_absolute_error

print('-------------| Mean Absolute Error on Train set : {} |-------------'.format(mean_absolute_error(y_train, y_train_pred)))

-------------| Mean Absolute Error on Train set : 0.04069678734438995 |-------------


In [23]:
y_pred = grids.predict(X_test)

from sklearn.metrics import mean_absolute_error

print('-------------| Mean Absolute Error on Test set : {} |-------------'.format(mean_absolute_error(y_test, y_pred)))

-------------| Mean Absolute Error on Test set : 0.04102252110569533 |-------------
