In [17]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, ARDRegression, Ridge, Lasso
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Load the raw sales table from disk into a Pandas DataFrame.
df = pd.read_csv("Video_Games_Sales_as_at_22_Dec_2016.csv")

# ---------------- Cleaning mistyped and missing values ----------------------------

# The four review columns below are forced to numeric; any entry that cannot be
# parsed as a number is turned into NaN (errors='coerce').
features_to_be_cleaned = ['Critic_Score' , 'Critic_Count', 'User_Score', 'User_Count']

df[features_to_be_cleaned] = df[features_to_be_cleaned].apply(
    pd.to_numeric, errors='coerce'
)

# Alternative imputation strategies that were tried (kept for reference):
#   mean of each numerical column
#df.fillna(df.mean(), inplace=True)
#   median of each numerical column
#df.fillna(df.median(), inplace=True)

# Current strategy: discard every row that has a NaN in any column.
df.dropna(inplace = True)

# Write the cleaned table out so it can be inspected.
df.to_csv('test.csv')

# Polynomial degree used by poly().
degree = 2

# Small positive offset: added to the denominator in normalization() and to
# x / y before the log transform at the bottom of the script.
epsilon = 0.0001

def feature_space():
    """Build the feature matrix and the regression targets from the global ``df``.

    Returns:
        x: dense NumPy array holding the four numeric review columns followed
           by the one-hot encodings of Platform, Genre, Publisher and
           Developer, concatenated column-wise in that order.
        y: DataFrame with the Global, NA, EU, JP and Other sales columns.
    """
    numeric_columns = ['Critic_Score', 'User_Score', 'Critic_Count', 'User_Count']
    categorical_columns = ['Platform', 'Genre', 'Publisher', 'Developer']
    target_columns = ['Global_Sales', 'NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']

    # Normalizing the numeric columns was tried and is currently disabled:
#     df['Critic_Score'] = normalization(df['Critic_Score'])
#     df['User_Score'] = normalization(df['User_Score'])
#     df['Critic_Count'] = normalization(df['Critic_Count'])
#     df['User_Count'] = normalization(df['User_Count'])

    # One encoder instance is re-fitted for every categorical column; each
    # encoding is converted to a dense array so it can be concatenated.
    encoder = OneHotEncoder()
    column_blocks = [df[numeric_columns]]
    for column in categorical_columns:
        column_blocks.append(encoder.fit_transform(df[[column]]).toarray())

    x = np.concatenate(column_blocks, axis=1)

    # Single-target alternative: df['Global_Sales']
    y = df[target_columns]

    # Low-variance feature selection was tried and is currently disabled:
    # sel = VarianceThreshold(threshold=(0.1 * (1 - 0.1)))
    # x = sel.fit_transform(x)

    return x, y

def normalization(x):
    """Standardize ``x``: subtract its mean and divide by its standard deviation.

    The module-level ``epsilon`` is added to the standard deviation, which
    keeps the division finite when ``x`` is constant.

    Args:
        x: numeric array-like accepted by ``np.mean`` / ``np.std``.

    Returns:
        The centered and scaled values.
    """
    centered = x - np.mean(x)
    spread = np.std(x) + epsilon
    return centered / spread
   
    
def linear(x,y):
    """Fit an ordinary least-squares model on (x, y) and print its R-squared.

    The score is computed on the same data the model was fitted on, so the
    printed "Accuracy" is the in-sample coefficient of determination.

    Args:
        x: feature matrix.
        y: regression target(s).
    """
    fitted = LinearRegression(fit_intercept=True).fit(x, y)
    print("Linear Regression Accuracy = ", fitted.score(x, y))
    
    
def ridge(x,y):
    """Fit a Ridge regression (alpha=0.05) on (x, y) and print its R-squared.

    The score is computed on the same data the model was fitted on, so the
    printed "Accuracy" is the in-sample coefficient of determination.

    Args:
        x: feature matrix.
        y: regression target(s).
    """
    fitted = Ridge(alpha=0.05).fit(x, y)
    print("Ridge Regression Accuracy = ", fitted.score(x, y))
    
    
def lasso(x,y):
    """Fit a Lasso regression (alpha=0.05) on (x, y) and print its R-squared.

    The score is computed on the same data the model was fitted on, so the
    printed "Accuracy" is the in-sample coefficient of determination.

    Args:
        x: feature matrix.
        y: regression target(s).
    """
    fitted = Lasso(alpha=0.05).fit(x, y)
    print("Lasso Regression Accuracy = ", fitted.score(x, y))
    
    
def poly(x,y):
    """Fit a polynomial regression on (x, y) and print its R-squared.

    The model is a pipeline that expands the features to the module-level
    ``degree`` and then fits a plain linear regression. The score is computed
    on the same data the model was fitted on.

    Args:
        x: feature matrix.
        y: regression target(s).
    """
    expansion = PolynomialFeatures(degree)
    fitted = make_pipeline(expansion, LinearRegression()).fit(x, y)
    print("Polynomial Regression Accuracy = ", fitted.score(x, y))
    
    
def ARD(x,y):
    """Fit an ARD regression on (x, y) and print its R-squared.

    The score is computed on the same data the model was fitted on, so the
    printed "Accuracy" is the in-sample coefficient of determination.

    Args:
        x: feature matrix.
        y: regression target.
    """
    fitted = ARDRegression(fit_intercept=True).fit(x, y)
    print("ARD Regression Accuracy = ", fitted.score(x, y))
    
    
def plot(model=None, X=None, y=None):
    """Scatter the data and overlay the model's predictions on an even grid.

    The predict function can be applied to a vector. Here we apply to it a
    sequence of evenly-spaced values corresponding to our x-axis.

    Previously this function read ``model`` and ``X`` as globals that are
    never defined at module scope, so every call failed with ``NameError``.
    They are now explicit (optional, for backward compatibility) parameters.

    Args:
        model: fitted estimator exposing ``predict``.
        X: values plotted on the x-axis of the scatter.
        y: values plotted on the y-axis of the scatter.

    Raises:
        ValueError: if any of ``model``, ``X`` or ``y`` is not supplied.
    """
    missing = [
        name
        for name, value in (("model", model), ("X", X), ("y", y))
        if value is None
    ]
    if missing:
        raise ValueError(
            "plot() needs a fitted model and the plotted data; missing: "
            + ", ".join(missing)
        )

    # NOTE(review): the 140-230 grid is hard-coded and reshaped to a single
    # column, so this presumably only suits a model fitted on one feature
    # whose values lie in that range -- confirm before relying on it.
    xfit = np.linspace(140,230,600).reshape(-1,1)
    yfit = model.predict(xfit)

    # Plot the data
    plt.scatter(X,y)
    plt.title('Video game sales prediction')
    plt.xlabel('Idek')
    plt.ylabel('Global Sales')
    plt.plot(xfit,yfit)
    plt.show()

# Build the features and targets, then shift both by epsilon (presumably so
# that the log transform below never receives an exact zero).
x, y = feature_space()
x, y = x + epsilon, y + epsilon

# NOTE(review): only the plain linear model is fitted on log-transformed
# data; ridge and lasso receive the shifted but untransformed values, so
# their scores are not directly comparable -- confirm this is intended.
log_x, log_y = np.log(x), np.log(y)
linear(log_x, log_y)
ridge(x,y)
lasso(x,y)
#poly(x,y)
#ARD(x,y)
#plot()

# TODO:
# Cross validation

# Things tried:
# Label encoding 12%
# One hot 42 %
# Linear Regression 66%
# Multivariate linear regression
# Lasso
# Ridge
# Linear
# ARD
# Inflation check
# Multivariate
# Different feature combination
# mean, median for missing values
# feature selection based on low variance
# Polynomial fit
# Normalization

Linear Regression Accuracy =  0.6603138387685868
Ridge Regression Accuracy =  0.41464823571760834
Lasso Regression Accuracy =  0.16201448985404673


[8.49977843 5.52672557 2.44570046 9.42854847 5.47863217 2.39760706
 8.40359162 5.43053876 2.34951365 8.35549821 5.38244536 2.30142024] 0
[[ 8.80681818]
 [ 5.92613636]
 [ 2.67613636]
 [10.        ]
 [ 5.64204545]
 [ 2.39204545]
 [ 8.23863636]
 [ 5.35795455]
 [ 2.10795455]
 [ 7.95454545]
 [ 5.07386364]
 [ 1.82386364]] []


  y = column_or_1d(y, warn=True)


[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 1.]]
