# **Installs**

In [None]:
! pip install skimpy
! pip install lightgbm

# **Imports**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json 
from google.colab import drive
from sklearn.linear_model import LinearRegression
from skimpy import skim
from sklearn import set_config
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from numpy import mean
from numpy import std
from numpy import absolute
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

# **Carga de datos**

In [None]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

drive.mount('/content/drive', force_remount=True)
with open("/content/drive/MyDrive/ANALISIS PREDICTIVO/TP 2/kaggle.json", 'r') as f:
    api_token= json.load(f)

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c ap-tp2-2q2022


import zipfile
import os

os.listdir()

for file in os.listdir():
    if file.endswith('.zip'):
      zip_ref = zipfile.ZipFile(file, 'r')
      zip_ref.extractall()
      zip_ref.close()

In [None]:
db=pd.read_csv("origen.csv", sep=",")

# **Vista de los datos**

In [None]:
db.columns

In [None]:
db.info()

In [None]:
pd.options.display.max_columns = None

In [None]:
db.head(5)

In [None]:
skim(db)

# **Distribucion de variable Target**

In [None]:
db['averageRating'].mean()

In [None]:
db['averageRating'].median()

In [None]:
sns.displot(db, x="averageRating")

# **Correlaciones**

In [None]:
# Correlate numeric/boolean columns only: with object columns present,
# DataFrame.corr() raises on pandas >= 2.0 (older versions dropped them silently).
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(db.select_dtypes(include=["number", "bool"]).corr(), annot=True, ax=ax)

# **Variables numericas**

In [None]:
db.describe().T

## Id,Budget y Revenue

In [None]:
db=db.drop(['Id','revenue','budget'],axis=1)

In [None]:
db['popularity']=db['popularity'].fillna(0)

In [None]:
db['popularity'].value_counts()

In [None]:
sns.displot(db['popularity'])

## isAdult

In [None]:
db['isAdult'].value_counts()

El valor 2020 no es válido para `isAdult` (debería ser 0 o 1), por lo que se eliminan las filas que lo contienen.

In [None]:
# Drop the rows whose isAdult value is 2020.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
db = db[db.isAdult != 2020].copy()

In [None]:
db['isAdult']=db['isAdult'].astype(int)

## EndYear

In [None]:
db['endYear'].describe()

In [None]:
sns.displot(db,x='endYear')

## RuntimeMinutes vs Runtime

In [None]:
# Missing-value counts for the two duration columns.
miss_runtime = db['runtime'].isnull().sum()
miss_runtime_min = db['runtimeMinutes'].isnull().sum()

# The preview is the last expression so that the notebook actually renders it.
db.loc[:, ['runtime', 'runtimeMinutes']].head(10)

In [None]:
# Rows where `runtime` is present, to compare it side by side with `runtimeMinutes`.
runtime_notmiss = db[db['runtime'].notnull()]
runtime_notmiss[['runtime', 'runtimeMinutes']].head(10)

In [None]:
print("Porcentaje de Missings runtime:",(db['runtime'].isnull().sum()/db.shape[0])*100)

In [None]:
print("Porcentaje de Missings runtimeMinutes:",(db['runtimeMinutes'].isnull().sum()/db.shape[0])*100)

**runtimeMinutes** es mas completa que **runtime**

In [None]:
db=db.drop(['runtime'],axis=1)

## SeasonNumber

In [None]:
# astype(int) raises on NaN, and the Missings section (which fills NaN with 0)
# only runs further down, so the fill is applied here before the cast.
db['seasonNumber'] = db['seasonNumber'].fillna(0).astype(int)

## EpisodeNumber

In [None]:
# astype(int) raises on NaN, and the Missings section (which fills NaN with 0)
# only runs further down, so the fill is applied here before the cast.
db['episodeNumber'] = db['episodeNumber'].fillna(0).astype(int)

## Ordering

In [None]:
db.loc[db['tagline']== "There won't be a dry seat in the house!" ]

In [None]:
db=db.drop(['ordering'],axis=1)

## isOriginalTitle

In [None]:
# astype(int) raises on NaN, and the Missings section (which fills NaN with 0)
# only runs further down, so the fill is applied here before the cast.
db['isOriginalTitle'] = db['isOriginalTitle'].fillna(0).astype(int)

# **Variables Categoricas**

In [None]:
db.describe(include='object').T

## Attributes

In [None]:
db['attributes'].value_counts()

In [None]:
db.loc[db['tagline']== "There won't be a dry seat in the house!"]

Una misma película aparece con distintos valores de **attributes**, por lo tanto se elimina la columna.

In [None]:
db=db.drop(['attributes'],axis=1)

## Genres

In [None]:
# Rows where `genres_y` is present, to compare it side by side with `genres_x`.
genres_y_t = db[db['genres_y'].notnull()]
genres_y_t[['genres_x', 'genres_y']].head(10)

In [None]:
print("Porcentaje de Missings genres_x:",(db['genres_x'].isnull().sum()/db.shape[0])*100)

In [None]:
print("Porcentaje de Missings genres_y:",(db['genres_y'].isnull().sum()/db.shape[0])*100)

No tiene sentido conservar ambas columnas porque contienen la misma información. Se conserva **genres_x**, que tiene menos valores faltantes.

In [None]:
db=db.drop(['genres_y'],axis=1)

## Genres_x

In [None]:
# Split the comma-separated genre string into one column per position.
lista = db["genres_x"].str.split(",", expand=True)

# Count how often each genre appears in any of the first three positions.
# iloc[:, :3] (rather than lista[0], lista[1], lista[2]) does not fail when no
# row has three genres; sort_index keeps the alphabetical row order that the
# previous concat + groupby produced.
res = (
    lista.iloc[:, :3]
    .stack()
    .value_counts()
    .sort_index()
    .rename_axis('unique_values')
    .reset_index(name='counts')
)

In [None]:
top_5_generos = res.sort_values('counts',ascending=False)['unique_values'].head(5)
top_5_generos

In [None]:
def func(a, b):
    """Return True when iterables `a` and `b` have at least one element in common."""
    return not set(a).isdisjoint(b)

# Build the lookup set once instead of re-scanning the Series for every row.
generos_top = set(top_5_generos)

# NaN genres are treated as "missing" (the placeholder the Missings section
# uses) so that .split does not fail on float NaN values.
df = [func(g.split(","), generos_top) for g in db['genres_x'].fillna("missing")]

# Flag column: True when the title has at least one of the most frequent genres.
db.insert(8, 'genero', df)

In [None]:
db['genero'] = db['genero'].astype(int)

## Directors

In [None]:
db['directors'].head(10)

In [None]:
# Split the comma-separated director list; n=3 leaves any remainder in a fourth column.
lista = db["directors"].str.split(",", n=3, expand=True)

# Count how often each name appears in any of the first three positions.
# iloc[:, :3] (rather than lista[0], lista[1], lista[2]) does not fail when no
# row has three names; sort_index keeps the alphabetical row order that the
# previous concat + groupby produced.
res = (
    lista.iloc[:, :3]
    .stack()
    .value_counts()
    .sort_index()
    .rename_axis('unique_values')
    .reset_index(name='counts')
)

In [None]:
# Take the 21 most frequent names, then discard the placeholder labels.
top20_directores = res.sort_values('counts', ascending=False)['unique_values'].head(21)
top20_directores = top20_directores[~top20_directores.isin(['0', 'missing'])]

In [None]:
def func(a, b):
    """Return True when iterables `a` and `b` have at least one element in common."""
    return not set(a).isdisjoint(b)

# Build the lookup set once instead of re-scanning the Series for every row.
directores_top = set(top20_directores)

# NaN director lists are treated as "missing" (the placeholder the Missings
# section uses) so that .split does not fail on float NaN values.
df = [func(d.split(","), directores_top) for d in db['directors'].fillna("missing")]

# Flag column: True when at least one director is among the most frequent ones.
db.insert(10, 'directores', df)

In [None]:
db["directores"] = db["directores"].astype(int)

## Writers

In [None]:
db['writers'].head(10)

In [None]:
# Split the comma-separated writer list; n=3 leaves any remainder in a fourth column.
lista = db["writers"].str.split(",", n=3, expand=True)

# Count how often each name appears in any of the first three positions.
# iloc[:, :3] (rather than lista[0], lista[1], lista[2]) does not fail when no
# row has three names; sort_index keeps the alphabetical row order that the
# previous concat + groupby produced.
res = (
    lista.iloc[:, :3]
    .stack()
    .value_counts()
    .sort_index()
    .rename_axis('unique_values')
    .reset_index(name='counts')
)

In [None]:
# Take the 20 most frequent names, then discard the placeholder labels.
top20_escritores = res.sort_values('counts', ascending=False)['unique_values'].head(20)
top20_escritores = top20_escritores[~top20_escritores.isin(['0', 'missing'])]

In [None]:
def func(a, b):
    """Return True when iterables `a` and `b` have at least one element in common."""
    return not set(a).isdisjoint(b)

# Build the lookup set once instead of re-scanning the Series for every row.
escritores_top = set(top20_escritores)

# NaN writer lists are treated as "missing" (the placeholder the Missings
# section uses) so that .split does not fail on float NaN values.
df = [func(w.split(","), escritores_top) for w in db['writers'].fillna("missing")]

# Flag column: True when at least one writer is among the most frequent ones.
db.insert(11, 'escritores', df)

In [None]:
db["escritores"] = db["escritores"].astype(int)

## final

In [None]:
db=db.drop(['writers','directors','genres_x',],axis=1)

In [None]:
db=db.drop(['language','adult','original_language','overview','production_companies',
            'production_countries','release_date','status','tagline','video'],axis=1)

# **Missings**

In [None]:
db.isnull().sum()

In [None]:
#sns.heatmap(db.isnull(), cbar=False)

## SeasonNumber

In [None]:
db['seasonNumber'].describe()

In [None]:
snumber_null=db.loc[db['seasonNumber'].isnull()==True]
snumber_null.groupby("titleType")["isAdult"].count()

Como la mayoría de los valores faltantes corresponde a tipos de título que no son series, se los reemplaza con 0.

In [None]:
db.loc[db["seasonNumber"].isnull()==True, "seasonNumber"] = 0
db.groupby('titleType')["isAdult"].value_counts()

## EpisodeNumber

In [None]:
epNumber_null= db.loc[db['episodeNumber'].isnull()==True]
epNumber_null.groupby("titleType")["isAdult"].count()

In [None]:
db['episodeNumber']=db['episodeNumber'].fillna(0)
db['episodeNumber'].isnull().sum()

## isOriginalTitle

In [None]:
db['isOriginalTitle'].value_counts()

In [None]:
db['isOriginalTitle']=db['isOriginalTitle'].fillna(0)

In [None]:
sns.displot(db['isOriginalTitle'])

## Genres_x

In [None]:
db.loc[db["genres_x"].isnull()==True, "genres_x"] = "missing"

## Directors

In [None]:
# Fill missing director lists with the same placeholder used for genres and writers.
# (This cell previously filled `writers`, leaving `directors` untouched.)
db.loc[db["directors"].isnull()==True, "directors"] = "missing"

## Writers

In [None]:
db.loc[db["writers"].isnull()==True, "writers"] = "missing"

## Total

In [None]:
db.isnull().sum()

In [None]:
#sns.heatmap(db.isnull(), cbar=False)

# **Outliers**

## NumVotes

In [None]:
db['numVotes'].describe()

In [None]:
plt.boxplot(db['numVotes'])
plt.show()

In [None]:
db[db['numVotes']>150].count()
#db=db[db.numVotes <150]

## StartYear

In [None]:
db['startYear'].describe()

In [None]:
sns.displot(db,x='startYear')

In [None]:
db = db[db.startYear > 1000]
sns.displot(db,x='startYear')

## RuntimeMinutes

In [None]:
db['runtimeMinutes'].describe()

In [None]:
db=db[db.runtimeMinutes >= 0]
#db=db[db.runtimeMinutes <=250]

In [None]:
sns.displot(db['runtimeMinutes'])

In [None]:
# Pass the column as x= explicitly: the first positional argument means `x`
# in seaborn 0.11 but `data` in 0.12+, which changes the plot orientation.
sns.boxplot(x=db['runtimeMinutes'])

## SeasonNumber

In [None]:
# Pass the column as x= explicitly: the first positional argument means `x`
# in seaborn 0.11 but `data` in 0.12+, which changes the plot orientation.
sns.boxplot(x=db['seasonNumber'])

In [None]:
db=db[db.seasonNumber<100]

In [None]:
# Pass the column as x= explicitly: the first positional argument means `x`
# in seaborn 0.11 but `data` in 0.12+, which changes the plot orientation.
sns.boxplot(x=db['seasonNumber'])

## EpisodeNumber

In [None]:
# Pass the column as x= explicitly: the first positional argument means `x`
# in seaborn 0.11 but `data` in 0.12+, which changes the plot orientation.
sns.boxplot(x=db['episodeNumber'])

In [None]:
#db=db[db.episodeNumber<2000]

In [None]:
# Pass the column as x= explicitly: the first positional argument means `x`
# in seaborn 0.11 but `data` in 0.12+, which changes the plot orientation.
sns.boxplot(x=db['episodeNumber'])

# **Duplicados**

In [None]:
dup_db=db.duplicated().any()
count_dup=db.duplicated().sum()
print("Duplicados:",dup_db," y son: ",count_dup)

In [None]:
db=db.drop_duplicates(keep="first")

# **Graficos**

In [None]:
db['titleType'].value_counts().plot(kind='bar', xlabel='numbers', ylabel='frequency')

In [None]:
sns.set(rc={'figure.figsize':(11.7,8.27)})
sns.boxplot(x="titleType", 
            y="averageRating", 
            data=db)

In [None]:
sns.boxplot(x="genero", y="averageRating", data=db)

In [None]:
sns.boxplot(x="directores", y="averageRating", data=db)

In [None]:
sns.boxplot(x="escritores", y="averageRating", data=db)

# **Dummies**

In [None]:
db=db.join(pd.get_dummies(db['titleType']))

In [None]:
db=db.drop(['titleType'],axis=1)

# **Correlaciones Finales**

In [None]:
# Correlate numeric/boolean columns only (this keeps the dummy columns whether
# pandas encodes them as uint8 or bool) so DataFrame.corr() does not raise if
# a non-numeric column is still present on pandas >= 2.0.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(db.select_dtypes(include=["number", "bool"]).corr(), annot=True, ax=ax)

# **Separacion de Datos**

In [None]:
X = db.drop(columns=['averageRating'])
y = db['averageRating']

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.25, random_state=33)

In [None]:
X.columns

In [None]:
X_train.shape

In [None]:
X_val.shape

In [None]:
#from sklearn.preprocessing import StandardScaler
#sc_X = StandardScaler()
#X_train = sc_X.fit_transform(X_train)
#X_val = sc_X.transform(X_val)

# **Modelos**

## **Regresion Lineal**

In [None]:
reg = LinearRegression()

In [None]:
reg.fit(X_train,y_train)

In [None]:
print("R2 score:",reg.score(X_val,y_val))

## **Ridge Regression**

In [None]:
ridge = Ridge(alpha=1.0)

In [None]:
ridge=ridge.fit(X_train, y_train)

In [None]:
print("R2 score:",ridge.score(X_val,y_val))

## **Decision Tree Regressor**

In [None]:
regressor = DecisionTreeRegressor(random_state = 1, max_depth=2)  
regressor.fit(X_train, y_train)

In [None]:
print("R2 score:",regressor.score(X_val, y_val))

## **RandomForest Regression**

In [None]:
# random_state is fixed so the forest (and the R2 score reported below) is
# reproducible across runs, like the other seeded tree models in this notebook.
random = RandomForestRegressor(max_depth=10, n_estimators=300, max_features=1, random_state=1)
random.fit(X_train, y_train)

In [None]:
print("R2 score:",random.score(X_val,y_val))

In [None]:
# Small seeded random forest as a second baseline for this section.
# The cell previously fitted an XGBRegressor and never used RFR, which only
# duplicated the XG Boost baseline of the next section.
RFR = RandomForestRegressor(n_estimators = 12, random_state=1)
rr = RFR.fit(X_train,y_train)
rr.score(X_val, y_val)

## **XG Boost**

In [None]:
XGB=XGBRegressor()

In [None]:
XGB = XGB.fit(X_train,y_train)
print("R2 score:",XGB.score(X_val, y_val))



---



In [None]:
# `learning_rate` was misspelled as `leraning_rate`, so the intended value was
# never applied as the learning rate.
XGB = XGBRegressor(colsample_bytree=0.7, gamma=0.2, learning_rate=0.12, max_depth=10,
             min_child_weight=7)

In [None]:
XGB.fit(X_train,y_train)

In [None]:
XGB.score(X_val,y_val)



---



**RandomSearch**

In [None]:
xgbr=XGBRegressor()

In [None]:
# Search space for the XGBoost randomized search.
# The key must be spelled `learning_rate`; the misspelled `leraning_rate`
# meant the search never varied the learning rate.
params ={
    "learning_rate"    : [0.1,0.12,0.15,0.17,0.2,0.25],
    "max_depth"        : [6,7,8,9,10,11],
    "min_child_weight" : [1,3,5,7],
    "gamma"            : [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree" : [0.3, 0.4, 0.5, 0.7]
}

In [None]:
random_search= RandomizedSearchCV(xgbr, param_distributions = params, n_iter = 5, n_jobs = -1, cv=5, verbose = 0)

In [None]:
random_search.fit(X_train,y_train)

In [None]:
random_search.best_estimator_

In [None]:
# `learning_rate` was misspelled as `leraning_rate`, so the intended value was
# never applied as the learning rate.
XGB_modelo=XGBRegressor(colsample_bytree=0.7, gamma=0.3, learning_rate=0.1, max_depth=11,
             min_child_weight=7)

In [None]:
XGB=XGB_modelo.fit(X_train,y_train)

In [None]:
XGB.score(X_val,y_val)



---



In [None]:
int=XGBRegressor(colsample_bytree=0.7, gamma=0.0, leraning_rate=0.1, max_depth=10,
             min_child_weight=3)

In [None]:
rer=int.fit(X_train,y_train)

In [None]:
rer.score(X_val,y_val)

## **HistGradient Boosting Regressor**

In [None]:
from sklearn.ensemble import HistGradientBoostingRegressor

In [None]:
HGBR=HistGradientBoostingRegressor()

In [None]:
HGBR.fit(X_train,y_train)

In [None]:
print("R2 score:",HGBR.score(X_val,y_val))



---



In [None]:
HGBR=HistGradientBoostingRegressor(max_bins=254,min_samples_leaf=20,max_leaf_nodes=254,max_iter=100,learning_rate=0.1,
                                   verbose=0,l2_regularization=0)

In [None]:
HGBR.fit(X_train,y_train)

In [None]:
HGBR.score(X_val,y_val)

## **Gradient Boosting**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
GBR = GradientBoostingRegressor().fit(X_train, y_train)

In [None]:
print("R2 score:",GBR.score(X_val,y_val))

## **LGBM**

In [None]:
LGBM=lgb.LGBMRegressor()

In [None]:
LGBM.fit(X_train,y_train)

In [None]:
print("R2 score:",LGBM.score(X_val,y_val))

**Ajuste de Hiperparametros**

In [None]:
param_grid = {
    'n_estimators': [400, 700, 1000],
    'max_depth': [15,20,25],
    'num_leaves': [50, 100, 200],
    'reg_alpha': [1.1, 1.2, 1.3],
    'reg_lambda': [1.1, 1.2, 1.3],
    'min_split_gain': [0.3, 0.4],
    'subsample': [0.7, 0.8, 0.9],
    'subsample_freq': [20]
}

In [None]:
random_search= RandomizedSearchCV(LGBM, param_distributions = param_grid, n_iter = 5, n_jobs = -1, cv=5, verbose = 0)

In [None]:
# Tune on the training split only, as the XGBoost search does: fitting on the
# full X, y lets the validation rows influence the chosen hyperparameters.
random_search.fit(X_train, y_train)

In [None]:
random_search.best_estimator_

In [None]:
random_LGBM=lgb.LGBMRegressor(max_depth=25, min_split_gain=0.4,
              n_estimators=1000, num_leaves=200, reg_alpha=1.3, reg_lambda=1.2,
              subsample=0.7, subsample_freq=20)

In [None]:
modelo_LGBM=random_LGBM.fit(X_train,y_train)

In [None]:
print("R2 score:", modelo_LGBM.score(X_val,y_val))

## **Catboost**

In [None]:
! pip install catboost

In [None]:
import catboost as cb
train_dataset = cb.Pool(X_train, y_train) 
test_dataset = cb.Pool(X_val, y_val)

In [None]:
model = cb.CatBoostRegressor(loss_function='RMSE')

In [None]:
grid = {'iterations': [100, 150, 200],
        'learning_rate': [0.03, 0.1],
        'depth': [2, 4, 6, 8],
        'l2_leaf_reg': [0.2, 0.5, 1, 3]}
model.grid_search(grid, train_dataset)

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
pred = model.predict(X_val)
rmse = (np.sqrt(mean_squared_error(y_val, pred)))
r2 = r2_score(y_val, pred)
print("Testing performance")
print('RMSE: {:.2f}'.format(rmse))
print('R2: {:.2f}'.format(r2))

# **SALIDA**

## **Carga de datos**

In [None]:
test=pd.read_csv("testear.csv", sep=",")

In [None]:
test_t=pd.DataFrame()

In [None]:
# Work on an independent copy so the in-place fills below do not modify `test`.
test_t = test.copy()

## **Missings**

### SeasonNumber

In [None]:
test_t.loc[test_t["seasonNumber"].isnull()==True, "seasonNumber"] = 0

### EpisodeNumber

In [None]:
test_t['episodeNumber']=test_t['episodeNumber'].fillna(0)

### isOriginalTitle

In [None]:
test_t['isOriginalTitle']=test_t['isOriginalTitle'].fillna(0)

### Popularity

In [None]:
test_t['popularity']=test_t['popularity'].fillna(0)

### Genres_x

In [None]:
test_t.loc[test_t["genres_x"].isnull()==True, "genres_x"] = "missing"

### Directors

In [None]:
# Fill missing director lists with the placeholder; this cell previously filled
# `writers`, so NaN directors reached the .split loop further down.
test_t.loc[test_t["directors"].isnull()==True, "directors"] = "missing"

### Writers

In [None]:
test_t.loc[test_t["writers"].isnull()==True, "writers"] = "missing"

## **Variables numericas**

### Id,Budget y Revenue

In [None]:
test_t=test_t.drop(['Id','revenue','budget'],axis=1)

### isAdult,isOriginalTitle,seasonNumber,episodeNumber

In [None]:
test_t = test_t.astype({'isAdult':'int64', 'isOriginalTitle':'int64','seasonNumber':'int64','episodeNumber':'int64'})

### RuntimeMinutes vs Runtime

In [None]:
test_t=test_t.drop(['runtime'],axis=1)

### Ordering

In [None]:
test_t=test_t.drop(['ordering'],axis=1)

## **Variables Categoricas**

### Attributes

In [None]:
test_t=test_t.drop(['attributes'],axis=1)

### Genres

In [None]:
test_t=test_t.drop(['genres_y'],axis=1)

### Genres_x

In [None]:
def func(a, b):
    """Tell whether iterables `a` and `b` share any element."""
    return not set(a).isdisjoint(b)

# One boolean per test row: does the title carry any of the top genres?
df = [func(generos.split(","), top_5_generos) for generos in test_t['genres_x']]
test_t.insert(8, 'genero', df)

### Directors

In [None]:
def func(a, b):
    """Return True when iterables `a` and `b` have at least one element in common."""
    return not set(a).isdisjoint(b)

# Build the lookup set once instead of re-scanning the Series for every row.
directores_top = set(top20_directores)

# NaN director lists are treated as "missing" so that .split does not fail on
# float NaN values (the fill cell above does not cover `directors`).
df = [func(d.split(","), directores_top) for d in test_t['directors'].fillna("missing")]

# Flag column: True when at least one director is among the most frequent ones.
test_t.insert(10, 'directores', df)

### Writers

In [None]:
def func(a, b):
    """Return True when iterables `a` and `b` have at least one element in common."""
    return not set(a).isdisjoint(b)

# The list built on the training data is `top20_escritores`; the name
# `top10_escritores` used here before does not exist and raised NameError.
escritores_top = set(top20_escritores)

df = [func(w.split(","), escritores_top) for w in test_t['writers']]

# Flag column: True when at least one writer is among the most frequent ones.
test_t.insert(11, 'escritores', df)

### final

In [None]:
# Cast the three boolean flag columns to integers, as done for the training frame.
flag_dtypes = {'genero': 'int64', 'directores': 'int64', 'escritores': 'int64'}
test_t = test_t.astype(flag_dtypes)

In [None]:
test_t=test_t.drop(['writers','directors','genres_x',],axis=1)

In [None]:
test_t=test_t.drop(['language','adult','original_language','overview','production_companies',
            'production_countries','release_date','status','tagline','video'],axis=1)

## **Dummies**

In [None]:
test_t=test_t.join(pd.get_dummies(test_t['titleType']))

In [None]:
test_t=test_t.drop(['titleType'],axis=1)

## **Final**

In [None]:
test_t.head(10)

In [None]:
# Align the test features with the training columns (same set, same order).
# The dummies are built separately for train and test, so a titleType absent
# from the test file would otherwise leave a column missing; it is added as 0.
resultado = modelo_LGBM.predict(test_t.reindex(columns=X_train.columns, fill_value=0))

In [None]:
salida = pd.DataFrame(data={"averageRating": resultado}).astype(str)
salida.index = test.Id
salida.to_csv("LGBM.csv", sep=',',index=True,  index_label='Id')