<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Merge-the-data" data-toc-modified-id="Merge-the-data-0.1">Merge the data</a></span></li></ul></li><li><span><a href="#Regression" data-toc-modified-id="Regression-1">Regression</a></span></li><li><span><a href="#Selecting-best-features" data-toc-modified-id="Selecting-best-features-2">Selecting best features</a></span></li><li><span><a href="#ML" data-toc-modified-id="ML-3">ML</a></span></li></ul></div>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from tqdm.notebook import tqdm
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

# Deep learning 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")


# tmdb 5000 DATA

In [2]:
tmdb_5000_cred = pd.read_csv(r'D:\OneDrive - NITT\Custom_Download\tmdb_5000_credits.csv', index_col=False)
tmdb_5000_cred.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [3]:
tmdb_5000_cred.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  4803 non-null   int64 
 1   title     4803 non-null   object
 2   cast      4803 non-null   object
 3   crew      4803 non-null   object
dtypes: int64(1), object(3)
memory usage: 150.2+ KB


In [4]:
tmdb_5000_mov = pd.read_csv(r'D:\OneDrive - NITT\Custom_Download\tmdb_5000_movies.csv',index_col=False)
tmdb_5000_mov.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


In [5]:
tmdb_5000_mov.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

#### Merge the data

In [6]:
tmdb_5000_cred.columns = ['id','tittle','cast','crew']
tmdb_5000_mov = tmdb_5000_mov.merge(tmdb_5000_cred,on='id')

In [7]:
tmdb_5000_mov.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4803 entries, 0 to 4802
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [8]:
tmdb_5000_mov.shape

(4803, 23)

In [9]:
# Checking to see if we can merge the data we got from API and the one from kaggle.

In [10]:
# Merging the data based on the original name 

In [11]:
tmdb_5000_mov[:2].T

Unnamed: 0,0,1
budget,237000000,300000000
genres,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""..."
homepage,http://www.avatarmovie.com/,http://disney.go.com/disneypictures/pirates/
id,19995,285
keywords,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na..."
original_language,en,en
original_title,Avatar,Pirates of the Caribbean: At World's End
overview,"In the 22nd century, a paraplegic Marine is di...","Captain Barbossa, long believed to be dead, ha..."
popularity,150.437577,139.082615
production_companies,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""..."


**Pularity** IMDbPro uses proprietary algorithms that take into account several measures of popularity for people, titles and companies. The primary measure is who and what people are looking at on IMDb. The rankings are updated on a weekly basis, typically by the end of Monday.

# NEW CODE

In [12]:
tmdb_5000_mov[tmdb_5000_mov['revenue']!=0]

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,tittle,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4775,0,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 35, ""name...",,33693,"[{""id"": 171993, ""name"": ""mumblecore""}]",en,Funny Ha Ha,"Unsure of what to do next, 23-year-old Marnie ...",0.362633,[],...,85.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,Funny Ha Ha,6.3,8,Funny Ha Ha,"[{""cast_id"": 1, ""character"": ""Marnie"", ""credit...","[{""credit_id"": ""52fe45309251416c9102a535"", ""de..."
4788,12000,"[{""id"": 27, ""name"": ""Horror""}, {""id"": 35, ""nam...",,692,"[{""id"": 237, ""name"": ""gay""}, {""id"": 900, ""name...",en,Pink Flamingos,Notorious Baltimore criminal and underground f...,4.553644,"[{""name"": ""Dreamland Productions"", ""id"": 407}]",...,93.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,An exercise in poor taste.,Pink Flamingos,6.2,110,Pink Flamingos,"[{""cast_id"": 8, ""character"": ""Divine / Babs Jo...","[{""credit_id"": ""52fe426bc3a36847f801d203"", ""de..."
4792,20000,"[{""id"": 80, ""name"": ""Crime""}, {""id"": 27, ""name...",,36095,"[{""id"": 233, ""name"": ""japan""}, {""id"": 549, ""na...",ja,キュア,A wave of gruesome murders is sweeping Tokyo. ...,0.212443,"[{""name"": ""Daiei Studios"", ""id"": 881}]",...,111.0,"[{""iso_639_1"": ""ja"", ""name"": ""\u65e5\u672c\u8a...",Released,Madness. Terror. Murder.,Cure,7.4,63,Cure,"[{""cast_id"": 3, ""character"": ""Kenichi Takabe"",...","[{""credit_id"": ""52fe45cc9251416c9103eb7b"", ""de..."
4796,7000,"[{""id"": 878, ""name"": ""Science Fiction""}, {""id""...",http://www.primermovie.com,14337,"[{""id"": 1448, ""name"": ""distrust""}, {""id"": 2101...",en,Primer,Friends/fledgling entrepreneurs invent a devic...,23.307949,"[{""name"": ""Thinkfilm"", ""id"": 446}]",...,77.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,What happens if it actually works?,Primer,6.9,658,Primer,"[{""cast_id"": 1, ""character"": ""Aaron"", ""credit_...","[{""credit_id"": ""52fe45e79251416c75066791"", ""de..."


In [13]:
tmdb_5000_mov.shape

(4803, 23)

In [14]:
list(tmdb_5000_mov)

['budget',
 'genres',
 'homepage',
 'id',
 'keywords',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'vote_average',
 'vote_count',
 'tittle',
 'cast',
 'crew']

In [15]:
df = tmdb_5000_mov.copy()

In [16]:
imp_cols = ['budget', 'genres','popularity','original_language','runtime','vote_average','vote_count']

In [17]:
df = df[imp_cols]

In [18]:
def get_val(dictionary_list):
    val = [d['name'] for d in eval(dictionary_list)]
    return val

In [19]:
from tqdm.notebook import tqdm
tqdm.pandas()
df['genres'] = df['genres'].progress_apply(get_val)

  0%|          | 0/4803 [00:00<?, ?it/s]

In [20]:
df

Unnamed: 0,budget,genres,popularity,original_language,runtime,vote_average,vote_count
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",150.437577,en,162.0,7.2,11800
1,300000000,"[Adventure, Fantasy, Action]",139.082615,en,169.0,6.9,4500
2,245000000,"[Action, Adventure, Crime]",107.376788,en,148.0,6.3,4466
3,250000000,"[Action, Crime, Drama, Thriller]",112.312950,en,165.0,7.6,9106
4,260000000,"[Action, Adventure, Science Fiction]",43.926995,en,132.0,6.1,2124
...,...,...,...,...,...,...,...
4798,220000,"[Action, Crime, Thriller]",14.269792,es,81.0,6.6,238
4799,9000,"[Comedy, Romance]",0.642552,en,85.0,5.9,5
4800,0,"[Comedy, Drama, Romance, TV Movie]",1.444476,en,120.0,7.0,6
4801,0,[],0.857008,en,98.0,5.7,7


In [21]:
all_genre = set(sum(df['genres'],[]))

In [22]:
for gen in tqdm(all_genre):
    df[gen] = df['genres'].apply(lambda x: 1 if gen in x else 0)

  0%|          | 0/20 [00:00<?, ?it/s]

In [23]:
df.drop('genres',axis=1,inplace=True)

In [24]:
df = pd.get_dummies(df)

In [25]:
df.head()

Unnamed: 0,budget,popularity,runtime,vote_average,vote_count,Music,Thriller,Comedy,Romance,Mystery,...,original_language_ru,original_language_sl,original_language_sv,original_language_ta,original_language_te,original_language_th,original_language_tr,original_language_vi,original_language_xx,original_language_zh
0,237000000,150.437577,162.0,7.2,11800,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,300000000,139.082615,169.0,6.9,4500,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,245000000,107.376788,148.0,6.3,4466,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,250000000,112.31295,165.0,7.6,9106,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,260000000,43.926995,132.0,6.1,2124,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
df.dropna(inplace=True)

In [27]:
X = df.drop('popularity',axis=1)
y = df['popularity']

In [28]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [29]:
X_train.nunique()

budget                   382
runtime                  150
vote_average              70
vote_count              1419
Music                      2
                        ... 
original_language_th       2
original_language_tr       2
original_language_vi       2
original_language_xx       2
original_language_zh       2
Length: 61, dtype: int64

In [31]:
# scale
from sklearn.preprocessing import StandardScaler

In [32]:
scale = StandardScaler()

In [33]:
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

In [34]:
X_train

array([[-4.11545474e-01, -5.54399433e-01, -4.18814318e-01, ...,
        -1.61395322e-02, -1.61395322e-02, -7.23574605e-02],
       [-5.12018910e-01, -6.40394498e-01,  5.03586058e-01, ...,
        -1.61395322e-02, -1.61395322e-02, -7.23574605e-02],
       [-7.02918438e-01, -6.83392030e-01,  1.42598643e+00, ...,
        -1.61395322e-02, -1.61395322e-02, -7.23574605e-02],
       ...,
       [-7.12965782e-01, -1.24424109e-01,  5.87440637e-01, ...,
        -1.61395322e-02, -1.61395322e-02, -7.23574605e-02],
       [-7.12965782e-01, -4.68404368e-01,  4.58579732e-04, ...,
        -1.61395322e-02, -1.61395322e-02, -7.23574605e-02],
       [ 6.68543961e-01,  5.20538876e-01,  6.71295217e-01, ...,
        -1.61395322e-02, -1.61395322e-02, -7.23574605e-02]])

### Regression

In [None]:
# Running the OLS regression model. 
X_train # Using the best features for the model
X_train_int = sm.add_constant(X_train) #Fitting the training data
model_3 = sm.OLS(y_train, X_train).fit()
model_3.summary()

0,1,2,3
Dep. Variable:,popularity,R-squared (uncentered):,0.422
Model:,OLS,Adj. R-squared (uncentered):,0.413
Method:,Least Squares,F-statistic:,47.59
Date:,"Thu, 27 Apr 2023",Prob (F-statistic):,0.0
Time:,14:10:47,Log-Likelihood:,-18324.0
No. Observations:,3840,AIC:,36760.0
Df Residuals:,3782,BIC:,37130.0
Df Model:,58,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
x1,1.1981,0.661,1.812,0.070,-0.098,2.495
x2,0.1372,0.590,0.232,0.816,-1.021,1.295
x3,1.3016,0.554,2.351,0.019,0.216,2.387
x4,22.6623,0.625,36.246,0.000,21.436,23.888
x5,-0.4947,0.601,-0.823,0.410,-1.673,0.684
x6,0.3397,0.567,0.599,0.549,-0.771,1.451
x7,0.0519,0.495,0.105,0.916,-0.918,1.021
x8,-0.2435,0.578,-0.421,0.674,-1.377,0.890
x9,1.2181,0.579,2.104,0.035,0.083,2.353

0,1,2,3
Omnibus:,8486.793,Durbin-Watson:,0.885
Prob(Omnibus):,0.0,Jarque-Bera (JB):,85267109.26
Skew:,19.746,Prob(JB):,0.0
Kurtosis:,731.944,Cond. No.,4.51e+16


### Selecting best features 

In [None]:
# Select the best features using SelectKBest
# Select the top 5 features
selector = SelectKBest(score_func=f_regression, k=5)  
X_train_selected = selector.fit_transform(X_train, y_train)

# Add a constant column to the selected features
X_train_int = sm.add_constant(X_train_selected)

# Create the OLS model
model = sm.OLS(y_train, X_train_int)

# Fit the model
result = model.fit()

# Print the summary
result.summary()

0,1,2,3
Dep. Variable:,popularity,R-squared:,0.616
Model:,OLS,Adj. R-squared:,0.616
Method:,Least Squares,F-statistic:,1233.0
Date:,"Thu, 27 Apr 2023",Prob (F-statistic):,0.0
Time:,14:10:49,Log-Likelihood:,-16795.0
No. Observations:,3840,AIC:,33600.0
Df Residuals:,3834,BIC:,33640.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,21.2720,0.310,68.613,0.000,20.664,21.880
x1,1.5548,0.415,3.748,0.000,0.742,2.368
x2,-0.3357,0.344,-0.975,0.329,-1.010,0.339
x3,1.1783,0.349,3.376,0.001,0.494,1.863
x4,22.8135,0.406,56.137,0.000,22.017,23.610
x5,0.9550,0.338,2.824,0.005,0.292,1.618

0,1,2,3
Omnibus:,8462.652,Durbin-Watson:,1.979
Prob(Omnibus):,0.0,Jarque-Bera (JB):,84728670.412
Skew:,19.607,Prob(JB):,0.0
Kurtosis:,729.647,Cond. No.,2.5


In [None]:
# Assuming X_test and y_test are the test data
X_test_selected = selector.transform(X_test)
X_test_int = sm.add_constant(X_test_selected)

# Make predictions using the fitted model
y_pred = result.predict(X_test_int)

# Calculate the MSE
mse = mean_squared_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)


In [None]:
# Create an empty DataFrame
results_df = pd.DataFrame()
# Add results to the DataFrame
#Create a new row with the results
new_row = {'Model': 'OLS', 'MSE': mse, 'RMSE': rmse}

# Append the new row to the results DataFrame
results_df = results_df.append(new_row, ignore_index=True)
results_df

Unnamed: 0,Model,MSE,RMSE
0,OLS,499.296984,22.344954


### ML

In [None]:
# MLP
reg = RandomForestRegressor()

In [None]:
reg.fit(X_train,y_train)

In [None]:
y_pred = reg.predict(X_test)

In [None]:
mean_squared_error(y_test,y_pred)

621.7889771246539

In [None]:
mean_squared_error(y_test,y_pred,squared=False)

24.93569684457713

In [None]:
d = pd.DataFrame({'True':y_test,'Pred':y_pred})

In [None]:
d

# https://www.google.com/search?q=mse+vs+rmse+sklearn&oq=&aqs=chrome.1.69i57j69i59j0i433i650j46i131i199i433i465i512l2j0i131i433i512j0i433i650j0i433i512j0i131i433i512j0i433i512.182474j0j4&sourceid=chrome&ie=UTF-8#ip=1

Unnamed: 0,True,Pred
596,13.267631,16.396622
4509,0.001586,0.927793
3050,9.525626,9.854925
2958,32.943848,30.565685
8,98.885637,107.257673
...,...,...
198,39.448066,39.603369
2422,32.746486,47.464520
1485,7.339908,7.677627
402,24.107835,23.150654


In [None]:
def run_regression_models(X_train, y_train, X_test, y_test):
    # Initialize the models
    models = {
        'Random Forest': RandomForestRegressor(),
        'XGBoost': XGBRegressor(),
        'AdaBoost': AdaBoostRegressor()
    }

    # Initialize the DataFrame to store results
    results_df_ml = pd.DataFrame(columns=['Model', 'MSE', 'RMSE'])

    # Loop through each model
    for model_name, model in models.items():
        # Fit the model
        model.fit(X_train, y_train)

        # Make predictions
        y_pred = model.predict(X_test)

        # Calculate MSE and RMSE
        mse = mean_squared_error(y_test, y_pred)
        rmse = mean_squared_error(y_test, y_pred, squared=False)

        # Add results to the DataFrame
        results_df_ml = results_df_ml.append({
            'Model': model_name,
            'MSE': mse,
            'RMSE': rmse
        }, ignore_index=True)

    return results_df_ml



In [None]:
results_df_ml = run_regression_models(X_train, y_train, X_test, y_test)
results_df_ml

Unnamed: 0,Model,MSE,RMSE
0,Random Forest,583.65422,24.158937
1,XGBoost,883.878978,29.730102
2,AdaBoost,843.287712,29.039417


In [None]:
results_df_ml= results_df_ml.append(results_df, ignore_index=True)

# Print the updated DataFrame
results_df_ml

Unnamed: 0,Model,MSE,RMSE
0,Random Forest,583.65422,24.158937
1,XGBoost,883.878978,29.730102
2,AdaBoost,843.287712,29.039417
3,OLS,499.296984,22.344954


In [38]:
# xgboost
# decision tree
# Adaboost
# MLP
# *****


# embedding - huggingface

In [34]:
X_train.shape[1]

61

In [35]:
# Set the input shape
input_shape = (X_train.shape[1],)
print(f'Feature shape: {input_shape}')

Feature shape: (61,)


In [36]:
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
# Initializing callback
callbacks = [ ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0),
              EarlyStopping(monitor='mean_squared_error', min_delta=1e-4, patience=5)]

In [37]:
# Create the model
model = Sequential()
model.add(Dense(16, input_shape=input_shape, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))

# Configure the model and start training
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])
model.fit(X_train, y_train, epochs=5, batch_size=32, 
          verbose=1, validation_split=0.2,callbacks=callbacks)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1eb199f4eb0>

In [39]:
c = np.array([[1,2,3],[4,5,6]])
c.shape

(2, 3)

In [40]:
c

array([[1, 2, 3],
       [4, 5, 6]])

In [42]:
c = c.ravel()

In [44]:
c

array([1, 2, 3, 4, 5, 6])

In [45]:
# true vs predicted
out = pd.DataFrame({
    'y_true':y_test,
    'y_pred':model.predict(X_test).ravel()
}
).T


out



Unnamed: 0,596,4509,3050,2958,8,577,3565,811,4640,1538,...,1971,551,2088,2930,3335,198,2422,1485,402,2760
y_true,13.267631,0.001586,9.525626,32.943848,98.885637,42.957216,6.470766,29.97224,0.619348,32.217425,...,3.572339,21.789615,9.662715,7.645979,4.943588,39.448066,32.746486,7.339908,24.107835,66.11334
y_pred,-568.501831,-10.6034,-78.220589,-18.064339,-1603.001831,-477.001831,-35.876839,-305.501831,1.704305,-320.251831,...,-197.751831,-552.501831,-310.876831,-84.501839,-63.251839,-974.501831,65.810661,-272.001831,-671.251831,206.084106


In [45]:
# # Train the model
# from sklearn.preprocessing import OneHotEncoder

# onehot_encoder = OneHotEncoder()
# y_train_encoded = onehot_encoder.fit_transform(y_train.reshape(-1, 1)).toarray()
# model.fit(X_train, y_train_encoded, epochs=3, batch_size=32, verbose=1)


In [48]:
# Make predictions on the test set
y_pred = model.predict(X_test)



In [49]:
# Calculate mean squared error
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 152132.96802890062


In [50]:
X_train.shape

(3840, 61)

In [51]:
# Calculate root mean squared error
rmse = mean_squared_error(y_test, y_pred,squared=False)
print('Root Mean Squared Error:', round(rmse,3))

Root Mean Squared Error: 390.042


In [52]:
4800/30

160.0

In [53]:
160/60

2.6666666666666665