In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install --upgrade plotnine
# Core data-handling and plotting stack.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# NOTE(review): two star-imports in a row. Any name exported by both packages ends
# up bound to the `ggplot` version, because that import runs last. A later cell
# calls ggplot()/aes()/geom_line()/stat_smooth(); presumably the plotnine versions
# (the package installed just above) are the ones intended -- confirm, then drop or
# reorder one of these two imports.
from plotnine import *
from ggplot import *
# Modelling utilities. NOTE(review): PCA and tensorflow are not referenced in the
# cells visible here -- confirm they are needed further down.
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
import tensorflow as tf

In [None]:
# Load the hackathon CSV files: `df` holds the training data, `df_test` the test inputs.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/imdb-prediction-by-123-of-ai-weekend-hackathon/train_data.csv"
)
df_test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/imdb-prediction-by-123-of-ai-weekend-hackathon/test_data_with_inputs.csv"
)

In [None]:
# Preview the training data: show its first ten rows.
df.head(n=10)

In [None]:
# Profit plotted against imdb_score: a line through the data plus a blue smoothed trend.
(
    ggplot(df, aes(x='imdb_score', y='Profit'))
    + geom_line()
    + stat_smooth(colour='blue', span=1)
)

In [None]:
# Lead actor (actor_1_name) of each of the 20 highest-rated movies,
# plotted against imdb_score and coloured by movie title.

plt.figure(figsize=(10, 8))

# keep only the 20 rows with the highest imdb_score
new_df = df.sort_values(by='imdb_score', ascending=False).head(20)

# draw straight from the frame, addressing columns by name
ax = sns.pointplot(data=new_df, x='actor_1_name', y='imdb_score', hue='movie_title')

# tilt the x tick labels so long names stay readable
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# The 20 highest-rated movies by title, plotted against imdb_score
# and coloured by their genres value.

plt.figure(figsize=(10, 8))

# keep only the 20 rows with the highest imdb_score
new_df = df.sort_values(by='imdb_score', ascending=False).head(20)

# draw straight from the frame, addressing columns by name
ax = sns.pointplot(data=new_df, x='movie_title', y='imdb_score', hue='genres')

# tilt the x tick labels so long titles stay readable
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=40, ha="right")
plt.tight_layout()
plt.show()

In [None]:
# Correlation heat map: shows which numeric features are similar to which others.
# Pairs with a correlation score above 0.5 have significant overlap in information.

# Only numeric columns can be correlated, so restrict the frame to those.
numeric_df = df.select_dtypes(include=[np.number])
# One-hot encoded copy of df with content_rating expanded.
# It is not used by the heat map in this cell.
df2 = pd.get_dummies(data=df, columns=['content_rating'], prefix=['content_rating'], drop_first=True)

# pairwise correlation between the numeric columns
corr = numeric_df.corr()
sns.set_context("notebook", font_scale=1.0, rc={"lines.linewidth": 2.5})
plt.figure(figsize=(25, 10))

# Boolean mask that hides the strict upper triangle (k=1 keeps the diagonal visible),
# so every pair of features is shown only once.
mask = np.triu(np.ones_like(corr, dtype=bool), k=1)
a = sns.heatmap(corr, mask=mask, annot=True, fmt='.2f')
# rotate the tick labels so the column names do not overlap
rotx = a.set_xticklabels(a.get_xticklabels(), rotation=90)
roty = a.set_yticklabels(a.get_yticklabels(), rotation=30)

In [None]:
# Predict imdb_score from the non-categorical features with a cross-validated Lasso,
# then report the three features with the largest absolute coefficients.

# Columns that are one-hot encoded below.
categorical_columns = ['director_name', 'actor_1_name', 'actor_2_name', 'genres', 'movie_title',
                       'actor_3_name', 'plot_keywords', 'language', 'country', 'content_rating']

# Define features and target variable
X = df.drop(['imdb_score'], axis=1)  # Features
y = df['imdb_score']  # Target variable

# Record the non-categorical column names BEFORE encoding. get_dummies replaces each
# encoded column with `<column>_<value>` columns, so filtering on the original names
# after encoding would exclude nothing and leave every dummy column in the model.
non_categorical_columns = [col for col in X.columns if col not in categorical_columns]

# One-hot encode the categorical features (X keeps both numeric and dummy columns)
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

# Separate non-categorical features: the model below is fit on these only
X_non_categorical = X[non_categorical_columns]

# Split the data into training and test sets.
# A fixed random_state makes the split, and so the reported MSE, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_non_categorical, y, test_size=0.2, random_state=42
)

# Fit a Lasso regression model with cross-validation for regularization
lasso = LassoCV(alphas=[5, 10, 100], cv=5, max_iter=20000)
lasso.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Rank features by absolute Lasso coefficient.
# Names are taken from the frame the model was fit on, so they line up with lasso.coef_.
coef_abs = np.abs(lasso.coef_)
feature_names = X_train.columns
feature_importance = sorted(zip(coef_abs, feature_names), key=lambda pair: pair[0], reverse=True)
top_3_features = feature_importance[:3]
print("Top 3 most important features:")
# Each tuple is (importance, feature); unpack in that order so the labels are not swapped.
for importance, feature in top_3_features:
    print(f"{feature}: {importance}")