**End goal: to predict the class (g or h)**


In [None]:
# import necessary packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [None]:
# column names for the dataset: ten feature columns followed by the target label ('class')

cols_name = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'class',
]

# load the data file, assigning the column names above via `names=`
df = pd.read_csv('./magic04.data', names=cols_name)

In [None]:
# preview the first five rows of the loaded frame

df.head(n=5)

**Engineer the class column**


In [None]:
# list the distinct labels that occur in the class column

pd.unique(df['class'])

In [None]:
# encode the label as an integer: 'g' becomes 1, every other value becomes 0
# note: running this cell a second time compares the integers against 'g' and turns every label into 0

df['class'] = df['class'].eq('g').astype(int)

In [None]:
# preview the first five rows again to inspect the class column

df.head(n=5)

**Data visualization**


In [None]:
# plot every feature's distribution split by class, since class is what the model should predict

for label in cols_name[:-1]:
    # alpha is transparency, so the two overlapping histograms both stay visible
    # density=True normalizes each histogram, which makes the two classes comparable
    # even though they contain different numbers of samples
    plt.hist(df[df['class'] == 1][label], color='blue',
             label='gamma', alpha=0.7, density=True)
    # class 0 is the hadron ('h') class, so it gets its own legend entry
    plt.hist(df[df['class'] == 0][label], color='red',
             label='hadron', alpha=0.7, density=True)
    plt.title(label)
    # because of density=True the y-axis is a normalized density, not a raw count
    # (class encoding: 1 = gamma 'g', 0 = hadron 'h')
    plt.ylabel("Probability")
    plt.xlabel(label)
    plt.legend()
    plt.show()

**Splitting my dataset to train, validate and test**

*train = 60%
validate = 20%
test = 20%*

In [None]:
df_len = len(df)

# cumulative row boundaries of the 60% / 20% / 20% split.
# Both boundaries are measured from the start of the frame, so the second one
# is 0.8 * len (0.6 + 0.2), not 0.2 * len: a smaller second boundary leaves
# `validate` empty and makes `test` overlap `train`.
train_end = int(0.6 * df_len)
validate_end = int(0.8 * df_len)

shuffled = df.sample(frac=1)  # df.sample(frac=1) to shuffle dataset

# positional slicing keeps each split a DataFrame and avoids calling np.split on a DataFrame
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

After shuffling and splitting the DataFrame, note that the feature columns are on very different scales.
Hence there's a need to standardize the dataset, so that every value is expressed relative to its column's mean and standard deviation.

In [None]:
def scale_dataset(dataFrame, overSample = False, scaler = None):
    """Standardize the feature columns and optionally balance the classes.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Every column except the last is treated as a feature; the last
        column is treated as the label to predict.
    overSample : bool, default False
        When True, the scaled data is resampled with RandomOverSampler so
        that the smaller class is padded with repeated samples.
    scaler : object with a ``transform`` method, optional
        An already fitted scaler. When given it is applied as-is (it is NOT
        re-fitted), so validation/test data can be scaled with the statistics
        learned from the training data. When None (the default), a new
        StandardScaler is fitted on ``dataFrame`` itself.

    Returns
    -------
    data : numpy.ndarray
        2D array of the scaled features with the labels as the last column.
    X : numpy.ndarray
        2D array of the scaled features.
    y : numpy.ndarray
        1D array of the labels.
    """
    X = dataFrame[dataFrame.columns[:-1]].values # these are the columns needed for the prediction
    y = dataFrame[dataFrame.columns[-1]].values # this is the predicted column

    if scaler is None:
        # no scaler supplied: fit a fresh one on this very data
        X = StandardScaler().fit_transform(X)
    else:
        # reuse the caller's fitted scaler so every split shares the same scaling
        X = scaler.transform(X)

    if overSample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y) # keep sampling from the smaller class (pad it) until it matches the larger class

    # hstack = horizontal stack, i.e. place the arrays side by side
    # X is 2D while y is 1D, so y must become a single column before stacking
    # np.reshape(y, (-1, 1)) -> 2D array with one column; -1 lets numpy infer the row count, i.e. len(y)
    data = np.hstack((X, np.reshape(y, (-1, 1))))

    return data, X, y

**Explore the data for 0 and 1 on train dataset**

In [None]:
# class balance of the training split, one count per line
print(
    len(train.loc[train['class'] == 1]),  # gamma
    len(train.loc[train['class'] == 0]),
    sep='\n',
)

from this we can see that we have more 1s than 0s

this imbalance might bias a model trained on this data, hence

we need to over-sample the smaller class so that the class counts match

In [None]:
# scale the training split and over-sample it; note this rebinds `train`
# from the DataFrame to the stacked array returned by scale_dataset
train, Xtrain, ytrain = scale_dataset(dataFrame=train, overSample=True)