In [2]:
# Down-sampling involves randomly removing observations from the majority class 
# to prevent its signal from dominating the learning algorithm.
# Most common method is resampling without replacement

# - First, we'll separate observations from each class into different DataFrames.
# - Next, we'll resample the majority class without replacement, setting the number 
# of samples close to that of the minority class (1000 vs. 806 in this notebook).
# - Finally, we'll combine the down-sampled majority class DataFrame with 
# the original minority class DataFrame.

In [3]:
# load necessary python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [4]:
# module for resampling
from sklearn.utils import resample

In [5]:
# os.getcwd()

In [6]:
# os.listdir(os.getcwd())

In [7]:
# read the csv x_data file into Pandas 

# Build the path from its components so it resolves on any OS; a
# backslash-separated literal is only a valid relative path on Windows.
all_param_df = pd.read_csv(os.path.join(os.pardir, "reduced_data", "all_param.csv"))
all_param_df.shape


(39624, 351)

In [15]:
# Split the rows by outcome label: suc_class == 0 -> failures, suc_class == 1 -> successes
df_fail = all_param_df.loc[all_param_df["suc_class"] == 0]
df_succ = all_param_df.loc[all_param_df["suc_class"] == 1]

In [16]:
# (rows, columns) of the failure-class frame
df_fail.shape

(38818, 351)

In [17]:
# (rows, columns) of the success-class frame
df_succ.shape

(806, 351)

In [21]:
# Down-sample the failure class: draw 1000 rows without replacement (seeded for repeatability).
# NOTE: 1000 is a fixed count, not the size of the success class, so the two
# classes come out close in size rather than exactly equal.
df_fail_downsample = resample(
    df_fail,
    replace=False,
    n_samples=1000,
    random_state=2,
)

# Stack the sampled failure rows on top of every success row
df_downsampled = pd.concat([df_fail_downsample, df_succ])

# Show how many rows each class contributes
df_downsampled["suc_class"].value_counts()

0    1000
1     806
Name: suc_class, dtype: int64

In [22]:
# # reindex the new downsampled dataframe
# df_downsampled.reindex()

In [23]:
# df_downsampled

In [24]:
# # read the csv y_data file into Pandas 
# y_parm_df = pd.read_csv("reduced_data\\y_params.csv")
# y_parm_df.shape

In [25]:
# Target vector: the binary suc_class label used for the logistic regression
y_class = df_downsampled.loc[:, 'suc_class']

In [26]:
# Preview the target series (pandas truncates the display for long series)
y_class

13716    0
4622     0
1350     0
30223    0
23274    0
        ..
39619    1
39620    1
39621    1
39622    1
39623    1
Name: suc_class, Length: 1806, dtype: int64

In [27]:
# Feature matrix: every column except the ones listed below.
# suc_class is the target; the other dropped columns are presumably identifiers
# or outcome-related fields that must not be used as predictors -- TODO confirm.
x_data = df_downsampled.drop(
    columns=['appid', 'maxccu', 'success_class', 'suc_class', 'followers'],
)

In [36]:
# (rows, columns) of the feature matrix
x_data.shape


(1806, 346)

In [40]:
# Hold out a test split using train_test_split's default proportions,
# seeded so the same rows land in each split on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_class,
    random_state=5,
)

In [41]:
# baseline example
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [43]:
# Fit the baseline logistic regression on the training split.
# X / y are kept as aliases of the training features and labels.
# NOTE: the recorded run stopped at the iteration limit without converging; the
# solver's warning suggests raising max_iter or scaling the features.
X, y = X_train, y_train

clf_2 = (
    LogisticRegression(max_iter=1000)
    .fit(X, y)
)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [44]:
# Accuracy of the fitted model on the training and the held-out test split.
# Uses clf_2, the classifier fitted above; no other classifier is defined in
# this notebook, so any other name fails on a fresh kernel.
print(f"Training Data Score: {clf_2.score(X_train, y_train)}")
print(f"Testing Data Score: {clf_2.score(X_test, y_test)}")

Training Data Score: 0.8751846381093058
Testing Data Score: 0.8805309734513275


In [45]:
# Per-class precision, recall and F1 for the logistic model on the held-out split.
# Predictions come from clf_2, the classifier fitted above; no other classifier
# is defined in this notebook.
from sklearn.metrics import classification_report
predictions = clf_2.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       248
           1       0.87      0.87      0.87       204

    accuracy                           0.88       452
   macro avg       0.88      0.88      0.88       452
weighted avg       0.88      0.88      0.88       452



In [None]:
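# Thinking about baseline results:
# Support is roughly balanced (248 vs. 204 test rows) due to the down-sampling
# Precision: 89% for unsuccessful games (class 0), 87% for successful games (class 1)
# Recall (fraction of each class's actual members that were found): 89% for class 0, 87% for class 1
# F1 score: 0.89 for class 0, 0.87 for class 1 (0.88 averaged)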