In [65]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, ARDRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Read in the data using Pandas. The result is stored in a Pandas Data Frame.
df = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")

# ----------------Cleaning the data of mistyped and missing values----------------------------

# Review score/count columns: any entry that cannot be parsed as a number
# (e.g. a stray string) is replaced with NaN.
features_to_be_cleaned = ['Critic_Score' , 'Critic_Count', 'User_Score', 'User_Count']

# pd.to_numeric converts a whole column in one vectorised call, which gives
# the same result as applying it element by element but without the
# per-row Python overhead.
for column in features_to_be_cleaned:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Alternative (disabled): replace NaN values of numerical columns with the column mean
#df.fillna(df.mean(), inplace=True)

# Alternative (disabled): replace NaN values of numerical columns with the column median
#df.fillna(df.median(), inplace=True)

# Drop every row that has a NaN in ANY column (not only the four cleaned above).
df.dropna(inplace = True)

# Save the cleaned dataframe for inspection.
df.to_csv('test.csv')

# Degree of the polynomial fit used by poly().
degree = 2

# Small positive offset. It is added before taking logarithms so that zero
# entries do not produce log(0), and added to the standard deviation in
# normalization() so that a constant column does not divide by zero.
epsilon = 0.001

def feature_space(use_test=False):
    """Assemble the feature matrix and sales targets from the global ``df``.

    The four numeric review columns are concatenated with dense one-hot
    encodings of Platform, Genre, Publisher and Developer. The result is
    split 80/20 with a fixed random seed so repeated calls give the same
    partition.

    Args:
        use_test: False (default) returns the training split; True returns
            the held-out 20% split.

    Returns:
        Tuple ``(x, y, state)``. ``state`` is 0 for the training split and 1
        for the test split, so the "Training"/"Testing" label printed by the
        model functions matches the data actually returned.
    """
    # fit_transform refits the encoder on every call, so a single instance
    # can safely be reused for each categorical column.
    enc = OneHotEncoder()

    # Normalizing numerical values (disabled experiment):
    # df['Critic_Score'] = normalization(df['Critic_Score'])
    # df['User_Score'] = normalization(df['User_Score'])
    # df['Critic_Count'] = normalization(df['Critic_Count'])
    # df['User_Count'] = normalization(df['User_Count'])

    f1 = df[['Critic_Score', 'User_Score', 'Critic_Count', 'User_Count']]
    f2 = enc.fit_transform(df[['Platform']]).toarray()
    f3 = enc.fit_transform(df[['Genre']]).toarray()
    f4 = enc.fit_transform(df[['Publisher']]).toarray()
    f5 = enc.fit_transform(df[['Developer']]).toarray()

    # Column order: numeric features first, then each one-hot group.
    features = np.concatenate((f1, f2, f3, f4, f5), axis=1)

    # Multi-output target: global sales plus the four regional figures.
    predicted_value = df[['Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']]
    #predicted_value = df['Global_Sales']

    x_train, x_test, y_train, y_test = train_test_split(
        features, predicted_value, test_size=0.2, random_state=0)

    # Low-variance feature selection (disabled experiment):
    # sel = VarianceThreshold(threshold=(0.1 * (1 - 0.1)))
    # x = sel.fit_transform(x)

    # The flag must describe the split that is returned: 0 for train, 1 for test.
    if use_test:
        return x_test, y_test, 1
    return x_train, y_train, 0

def normalization(x):
    """Standardise ``x``: subtract its mean, then divide by (std + epsilon).

    The module-level ``epsilon`` keeps the denominator non-zero when ``x``
    has zero standard deviation.
    """
    centred = x - np.mean(x)
    spread = np.std(x) + epsilon
    return centred / spread
   
    
def linear(x,y,state):
    """Fit ordinary least squares on (x, y) and print its R-squared.

    The score is computed on the same data the model was fitted on.
    ``state`` only selects the printed label: 0 prints the "Training"
    message, any other value the "Testing" message.
    """
    fitted = LinearRegression(fit_intercept=True).fit(x, y)
    r_squared = fitted.score(x, y)

    label = ("Linear Regression Training Accuracy = " if state == 0
             else "Linear Regression Testing Accuracy = ")
    print(label, r_squared)
    
    
def ridge(x,y,state):
    """Fit a Ridge regression (alpha=0.05) on (x, y) and print its R-squared.

    The score is computed on the same data the model was fitted on.
    ``state`` only selects the printed label: 0 prints the "Training"
    message, any other value the "Testing" message.
    """
    fitted = Ridge(alpha=0.05).fit(x, y)
    r_squared = fitted.score(x, y)

    label = ("Ridge Regression Training Accuracy = " if state == 0
             else "Ridge Regression Testing Accuracy = ")
    print(label, r_squared)
    
    
def lasso(x,y,state):
    """Fit a Lasso regression (alpha=0.05) on (x, y) and print its R-squared.

    The score is computed on the same data the model was fitted on.
    ``state`` only selects the printed label: 0 prints the "Training"
    message, any other value the "Testing" message.
    """
    fitted = Lasso(alpha=0.05).fit(x, y)
    r_squared = fitted.score(x, y)

    label = ("Lasso Regression Training Accuracy = " if state == 0
             else "Lasso Regression Testing Accuracy = ")
    print(label, r_squared)
    
    
def poly(x,y,state): # Doesn't work for multiple features (Error : Runs out of memory)
    """Fit a polynomial regression on (x, y) and print its R-squared.

    The model is a pipeline that expands the inputs with
    ``PolynomialFeatures(degree)`` (module-level ``degree``) and then fits a
    ``LinearRegression``. The score is computed on the same data the model
    was fitted on. ``state`` only selects the printed label: 0 prints the
    "Training" message, any other value the "Testing" message.
    """
    expander = PolynomialFeatures(degree)
    fitted = make_pipeline(expander, LinearRegression()).fit(x, y)
    r_squared = fitted.score(x, y)

    label = ("Polynomial Regression Training Accuracy = " if state == 0
             else "Polynomial Regression Testing Accuracy = ")
    print(label, r_squared)
    
    
def ARD(x,y,state): # Takes forever to train
    """Fit an ARD regression on (x, y) and print its R-squared.

    The score is computed on the same data the model was fitted on.
    ``state`` only selects the printed label: 0 prints the "Training"
    message, any other value the "Testing" message.
    """
    fitted = ARDRegression(fit_intercept=True).fit(x, y)
    r_squared = fitted.score(x, y)

    label = ("ARD Regression Training Accuracy = " if state == 0
             else "ARD Regression Testing Accuracy = ")
    print(label, r_squared)
    
    
def plot(model=None, X=None, y=None):
    """Scatter the data and overlay the model's prediction curve.

    The fitted model is evaluated on 600 evenly spaced values between 140
    and 230, shaped as a single column, so ``model`` must accept exactly one
    input feature.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Values for the horizontal axis of the scatter plot.
        y: Values for the vertical axis of the scatter plot.

    Any argument left as None is looked up under the same name at module
    level, so a bare ``plot()`` call keeps its original meaning.

    Raises:
        NameError: If ``model``, ``X`` or ``y`` is neither passed in nor
            defined at module level.
    """
    # Resolve omitted arguments from module globals and fail up front with a
    # message naming everything that is missing, before any plotting starts.
    module_scope = globals()
    if model is None:
        model = module_scope.get('model')
    if X is None:
        X = module_scope.get('X')
    if y is None:
        y = module_scope.get('y')

    missing = [name for name, value in (('model', model), ('X', X), ('y', y))
               if value is None]
    if missing:
        raise NameError(
            "plot() is missing " + ", ".join(missing)
            + "; pass as arguments or define at module level")

    # predict() expects a 2-D input, hence the reshape to one column.
    xfit = np.linspace(140,230,600).reshape(-1,1)
    yfit = model.predict(xfit)

    # Plot the data
    plt.scatter(X,y)
    plt.title('Video game sales prediction')
    plt.xlabel('Idek')
    plt.ylabel('Global Sales')
    plt.plot(xfit,yfit)
    plt.show()

# Build the features/targets, then move both onto a log scale; epsilon keeps
# zero entries away from log(0).
x, y, state = feature_space()
x, y = np.log(x + epsilon), np.log(y + epsilon)

# Fit and report each enabled model in turn.
for regressor in (linear, ridge, lasso):
    regressor(x, y, state)

# Disabled: too slow or too memory-hungry on the full feature set.
#poly(x,y,state)
#ARD(x,y,state)
#plot()

# TODO:
# Validation and test set

# Things tried:
# Label encoding 12%
# One hot 42 %
# Linear Regression 71%
# Multivariate linear regression
# Lasso
# Ridge
# Linear
# ARD
# Inflation check
# Multivariate
# Different feature combination
# mean, median for missing values
# feature selection based on low variance
# Polynomial fit
# Normalization
# log transformations
# Different log bases

Linear Regression Testing Accuracy =  0.7103204088184942
Ridge Regression Testing Accuracy =  0.7158075581059438
Lasso Regression Testing Accuracy =  0.5026923296999122


[[3876.]
 [5986.]
 [5840.]
 ...
 [1600.]
 [ 732.]
 [ 720.]]
