In [None]:
# Kaggle starter cell. The runtime is the kaggle/python docker image
# (https://github.com/kaggle/docker-python), which ships with the common
# analytics libraries pre-installed.

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input files live under "../input/"; list that directory so the available
# data files are visible in the cell output.

import os
print(os.listdir("../input"))

# Anything written to the current working directory is kept as notebook output.

In [None]:
# Libraries used throughout the notebook.
import pandas as pd
import matplotlib
import seaborn as sns
import numpy as np
import sklearn
import matplotlib.pyplot as plt

# Report the installed version of each library, one line per library, so the
# environment that produced the results is recorded in the output.
for label, module in (
    ("Pandas: ", pd),
    ("matplotlib: ", matplotlib),
    ("Scikit-learn: ", sklearn),
    ("seaborn: ", sns),
    ("Numpy: ", np),
):
    print(label, module.__version__)

In [None]:
# scikit learn modules for predictions
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Read the games CSV from the input directory into a DataFrame.
games = pd.read_csv(filepath_or_buffer='../input/games.csv')

In [None]:
# First look at the dataset: report how many rows and columns it has.
n_rows, n_cols = games.shape
print("The dataset has ", n_rows, " rows and ", n_cols, " columns.")

In [None]:
# List every column name available in the dataset.
feature_names = games.columns.tolist()
print("Features in the dataset: \n", feature_names)

In [None]:
# Use matplotlib's built-in "fivethirtyeight" style sheet for all later plots.
plt.style.use(style='fivethirtyeight')

In [None]:
# Histogram of the average rating across all games (before any cleaning).
average_ratings = games["average_rating"]
plt.hist(x=average_ratings)
plt.show()

This is unusual: a large number of games have an average rating of exactly 0, so we need to cross-check this before doing any prediction.

In [None]:
# Peek at the first five rows to see what the values look like.
display(games.head(n=5))

In [None]:
# Show the dtype and non-null count of every column.
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in display() -- that only adds a stray "None" to the cell output.
games.info()

## Insights:

* The data does not have many missing values.
* The columns are 10 float type, 8 int type, and 2 object type.

In [None]:
# Percentage of missing values per column: null count scaled by the row count.
columns = games.columns
null_counts = games.isnull().sum()
percent_missing = null_counts * 100 / len(games)

# One row per dataset column, holding its name and its missing percentage.
missing_value_games = pd.DataFrame(
    {'Column_name': columns, 'missing_percent': percent_missing}
)
display(missing_value_games)

### Missing values Insight:
* The missing-value table clearly shows that the percentage of missing values is very low.

In [None]:
# Compare one example from each group: the first game whose average rating is
# exactly zero, then the first game whose average rating is above zero.
rating = games["average_rating"]

print(games.loc[rating == 0].iloc[0])

print(games.loc[rating > 0].iloc[0])

 ### Insights:
 * The first row with a zero rating has no user ratings at all, which is why its average rating is zero.
 * In contrast, the first row with a rating greater than zero was rated by 20113 users.
 * This feature tells us a lot about a game: if a game has no user ratings, it is better to remove it.

In [None]:
# Keep only games that at least one user has rated, then drop every row that
# still contains a missing value (there are very few of them).
has_ratings = games["users_rated"] > 0
games = games.loc[has_ratings].dropna(axis=0)

# Report the size of the dataset after this preprocessing.
n_rows, n_cols = games.shape
print("Shape of the data: ", n_rows, "x", n_cols)

In [None]:
# Now let's see again the user average rating distribution
plt.hist(games["average_rating"])
plt.show()

### Insights:
Look at the '0' rating bar: it has shrunk by almost 95% (a rough guess). This histogram looks much better than the previous one, and the ratings now look roughly normally distributed.

In [None]:
# The "id" column is removed from the frame here; afterwards show the last
# five rows of the result.
games = games.drop(columns=["id"])
display(games.tail(n=5))

## correlation analysis

In [None]:
# Correlation heatmap of the numeric columns.
fig = plt.figure(figsize = (19, 8))

# Restrict to numeric columns before correlating. pandas 2.0+ raises on
# non-numeric columns in DataFrame.corr() instead of silently dropping them,
# and select_dtypes also works on older pandas releases.
games_corr = games.select_dtypes(include="number").corr()

# Draw the matrix with the column names as tick labels on both axes.
sns.heatmap(games_corr,
            xticklabels = games_corr.columns.values,
            yticklabels = games_corr.columns.values,
            square = True, annot = False
)
# Render the figure explicitly so the cell does not echo the Axes repr.
plt.show()


### Insights:
* minplaytime and maxplaytime are highly correlated with playingtime.
* minage is negatively correlated with users_rated.
* bayes_average_rating does not tell us much either; it is just the Bayesian average of the rating.

All in all, we need to filter the columns to get a good feature set before training the model.

In [None]:
# Target variable. It must be captured before `games` is narrowed to the
# feature columns below, because "average_rating" is removed there.
target = games["average_rating"]

# Columns left out of the feature set.
excluded = ("bayes_average_rating", "average_rating", "type", "name")

# Keep the remaining columns in their original order.
columns = []
for col in games.columns.tolist():
    if col not in excluded:
        columns.append(col)

# Feature frame restricted to the selected columns.
games = games[columns]
display(games.columns)
# display(target)

Now it's time to prepare the dataset for training and testing.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(games, target, test_size=0.25, random_state=42)
train_games_X, test_games_X, train_games_y, test_games_y = split_parts

## Linear Regression


In [None]:
from sklearn.metrics import mean_squared_error

# Linear regression with scikit-learn's default settings, trained on the
# training split.
lin_reg = LinearRegression()
lin_reg.fit(X=train_games_X, y=train_games_y)

In [None]:
# Predict ratings for the held-out test set.
predictions = lin_reg.predict(test_games_X)

# Mean squared error on the test set. scikit-learn metrics take
# (y_true, y_pred) in that order; MSE happens to be symmetric, but keeping the
# documented order avoids silent mistakes if the metric is ever swapped.
display(mean_squared_error(test_games_y, predictions))

## Random Forest Regressor

In [None]:
# Random forest of 100 trees, each leaf holding at least 10 samples; the fixed
# random_state keeps the fitted model reproducible.
rf_reg = RandomForestRegressor(
    random_state=42,
    n_estimators=100,
    min_samples_leaf=10,
)

rf_reg.fit(X=train_games_X, y=train_games_y)

In [None]:
# Predict ratings for the held-out test set with the random forest.
predictions_rf = rf_reg.predict(test_games_X)

# Mean squared error on the test set, passed in scikit-learn's documented
# (y_true, y_pred) order.
mean_squared_error(test_games_y, predictions_rf)

## Conclusion:
* Linear Regression gives a mean squared error of about 2.08, while the Random Forest regressor, a nonlinear model, gives a much better result of about 1.46.
* These results come directly from the test set, without any cross-validation and without checking for overfitting, so there is a lot of room for improvement.
* For now, the clear winner is the Random Forest regressor.
* We can also test on a single unseen data point to predict its rating.