In [61]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from fast_ml.model_development import train_valid_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

In [119]:
# import data from dvc
import dvc.api

with dvc.api.open(
    'data/data_with_os.csv',
    mode='rb',
    
) as data:
    df_with_os = pd.read_csv(data)
    df_with_os.head()
df_with_os.drop(['auction_id'],axis=1,inplace=True)

In [120]:
# Change data types to category
df_with_os['platform_os']=df_with_os['platform_os'].astype('category')
df_with_os['user_response']=df_with_os['user_response'].astype('category')

In [41]:
categorical_column = df_with_os.select_dtypes(include=["object","category"]).columns.tolist()

# Get column names have less than 10 more than 2 unique values
to_one_hot_encoding = [col for col in categorical_column if df_with_os[col].nunique() <= 10 and df_with_os[col].nunique() > 2]

# Get Categorical Column names thoose are not in "to_one_hot_encoding"
to_label_encoding = [col for col in categorical_column if not col in to_one_hot_encoding]

print("To One Hot Encoding:", to_one_hot_encoding)
print("To Label Encoding:", to_label_encoding)

To One Hot Encoding: ['date']
To Label Encoding: ['experiment', 'device_make', 'platform_os', 'user_response']


In [44]:
one_hot_encoded_columns = pd.get_dummies(df_with_os[to_one_hot_encoding])
one_hot_encoded_columns

Unnamed: 0,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,date_2020-07-07,date_2020-07-08,date_2020-07-09,date_2020-07-10
0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0
4,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...
1238,0,0,0,0,0,1,0,0
1239,0,0,0,0,0,0,1,0
1240,0,0,0,0,0,0,1,0
1241,0,0,0,0,0,0,0,1


In [48]:
# Label Encoding

label_encoded_columns = []
# For loop for each columns
for col in to_label_encoding:
    # We define new label encoder to each new column
    le = LabelEncoder()
    # Encode our data and create new Dataframe of it, 
    # notice that we gave column name in "columns" arguments
    column_dataframe = pd.DataFrame(le.fit_transform(df_with_os[col]), columns=[col] )
    # and add new DataFrame to "label_encoded_columns" list
    label_encoded_columns.append(column_dataframe)

# Merge all data frames
label_encoded_columns = pd.concat(label_encoded_columns, axis=1)
label_encoded_columns

Unnamed: 0,experiment,device_make,platform_os,user_response
0,1,13,1,1
1,0,43,1,1
2,0,13,1,1
3,1,65,1,1
4,1,13,1,1
...,...,...,...,...
1238,1,69,1,0
1239,1,13,1,0
1240,0,13,1,0
1241,0,13,1,0


In [54]:
# Copy our DataFrame to X variable
X = df_with_os.copy()

# Droping Categorical Columns,
# "inplace" means replace our data with new one
# Don't forget to "axis=1"
X.drop(categorical_column, axis=1, inplace=True)

# Merge DataFrames
X = pd.concat([X, one_hot_encoded_columns, label_encoded_columns], axis=1)
print("All columns:", X.columns.tolist())
X

All columns: ['hour', 'date_2020-07-03', 'date_2020-07-04', 'date_2020-07-05', 'date_2020-07-06', 'date_2020-07-07', 'date_2020-07-08', 'date_2020-07-09', 'date_2020-07-10', 'experiment', 'device_make', 'platform_os', 'user_response']


Unnamed: 0,hour,date_2020-07-03,date_2020-07-04,date_2020-07-05,date_2020-07-06,date_2020-07-07,date_2020-07-08,date_2020-07-09,date_2020-07-10,experiment,device_make,platform_os,user_response
0,16,0,1,0,0,0,0,0,0,1,13,1,1
1,4,0,0,0,0,0,1,0,0,0,43,1,1
2,15,1,0,0,0,0,0,0,0,0,13,1,1
3,13,0,0,0,0,0,0,1,0,1,65,1,1
4,6,0,0,1,0,0,0,0,0,1,13,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,17,0,0,0,0,0,1,0,0,1,69,1,0
1239,18,0,0,0,0,0,0,1,0,1,13,1,0
1240,7,0,0,0,0,0,0,1,0,0,13,1,0
1241,16,0,0,0,0,0,0,0,1,0,13,1,0


In [63]:
# # Define Y (This is the value we will predict)
# y = df_with_os["user_response"]

# # Droping "class" from X
# X.drop(["user_response"], axis=1, inplace=True)
y_train

240     1
616     0
777     0
586     0
892     0
       ..
1095    0
57      1
605     0
1131    0
156     1
Name: user_response, Length: 870, dtype: int64

In [76]:
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(X, target = 'user_response', 
                                                                            train_size=0.7, valid_size=0.2, test_size=0.1)
print(f"Train: {X_train.shape} {y_train.shape}")
print(f"Train: {X_valid.shape} {y_valid.shape}")
print(f"Train: {X_test.shape} {y_test.shape}")

Train: (870, 12) (870,)
Train: (248, 12) (248,)
Train: (125, 12) (125,)


In [121]:
# Define Random Forest Model
rf = RandomForestClassifier(n_estimators=50)

# We fit our model with our train data
rf.fit(X_train, y_train)

# Then predict results from X_test data
pred_rf = rf.predict(X_test)

# See First 10 Predictions and They Actual Values
print("Predicted:", pred_rf[0:10])
print("Actual:", y_test.values[0:10])

Predicted: [1 1 0 0 0 0 0 0 0 0]
Actual: [0 1 0 0 0 0 1 0 1 0]


In [117]:
cm_rf = confusion_matrix(y_test, pred_rf)
print(cm_rf)

[[47 18]
 [34 26]]


In [118]:
accuracy_score(y_test,pred_rf)

0.584