In [96]:
import pandas as pd
from datetime import datetime
from sklearn import preprocessing

## Cleaning and Preprocessing Datasets

In [97]:
# Online Shoppers Intention dataset: name the target column and load the raw CSV.
target1 = 'Revenue'
df1 = pd.read_csv("../data/original/online_shoppers_intention.csv")

# --- Integer-encode the non-numeric columns ---

# The target column and the weekend flag are cast directly to integers.
for flag_column in (target1, 'Weekend'):
    df1[flag_column] = df1[flag_column].astype(int)

# Month labels -> calendar month number (note that June is keyed by its full name).
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
d = {label: number for number, label in enumerate(month_labels, start=1)}
df1['Month'] = df1['Month'].map(d)

# Visitor type labels -> integer codes.
d = {
    'New_Visitor': 0,
    'Returning_Visitor': 1,
    'Other': 2,
}
df1['VisitorType'] = df1['VisitorType'].map(d)

# --- Standardise the numerical columns ---
numerical_features = [
    "Administrative", "Administrative_Duration",
    "Informational", "Informational_Duration",
    "ProductRelated", "ProductRelated_Duration",
    "BounceRates", "ExitRates", "PageValues", "SpecialDay",
]
# The scaler is fitted on the full frame and the same columns are overwritten with the scaled values.
scaler = preprocessing.StandardScaler()
df1[numerical_features] = scaler.fit_transform(df1[numerical_features])

# --- Write the processed frame to disk, then display it ---
df1.to_csv('../data/processed/online_shoppers_intentions.csv')
df1

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,2,1,1,1,1,1,0,0
1,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.590903,-0.457683,1.171473,-0.317178,-0.308821,2,2,2,1,2,1,0,0
2,-0.696993,-0.457191,-0.396478,-0.244931,-0.691003,-0.624348,3.667189,3.229316,-0.317178,-0.308821,2,4,1,9,3,1,0,0
3,-0.696993,-0.457191,-0.396478,-0.244931,-0.668518,-0.622954,0.573535,1.994610,-0.317178,-0.308821,2,3,2,2,4,1,0,0
4,-0.696993,-0.457191,-0.396478,-0.244931,-0.488636,-0.296430,-0.045196,0.142551,-0.317178,-0.308821,2,3,3,1,4,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12325,0.206173,0.363075,-0.396478,-0.244931,0.478227,0.307822,-0.310366,-0.288966,0.342125,-0.308821,12,4,6,1,1,1,1,0
12326,-0.696993,-0.457191,-0.396478,-0.244931,-0.601062,-0.380957,-0.457683,-0.447364,-0.317178,-0.308821,11,3,2,1,8,1,1,0
12327,-0.696993,-0.457191,-0.396478,-0.244931,-0.578577,-0.528063,1.261014,0.897093,-0.317178,-0.308821,11,3,2,1,13,1,1,0
12328,0.507228,-0.032916,-0.396478,-0.244931,-0.376210,-0.443536,-0.457683,-0.453140,-0.317178,-0.308821,11,2,2,3,11,1,0,0


In [98]:
# Marketing Campaign dataset: load the tab-separated raw file and name the target column.
df2 = pd.read_csv("../data/original/marketing_campaign.csv", sep='\t')
target2 = 'Teenhome'

# Fixing Marketing Campaign Dataset

# Z_CostContact and Z_Revenue never change, therefore they are not good features for prediction.
# Since ID is unique in each row, we drop it as well.
df2 = df2.drop(columns=['Z_CostContact', 'Z_Revenue', 'ID'])

# Filling empty instances.
# Only Income is imputed with the mean income. A frame-wide fillna would also write the
# income mean into any other column that happened to contain a missing value.
df2['Income'] = df2['Income'].fillna(df2['Income'].mean())

# Converting date to day_count variable: Dt_Customer holds day-month-year strings,
# which are replaced by the number of days elapsed since `first`.
# NOTE(review): `first` is presumably the earliest Dt_Customer in the data - confirm.
first = datetime(2012, 7, 30, 0, 0, 0)
df2['Dt_Customer'] = (pd.to_datetime(df2['Dt_Customer'], format='%d-%m-%Y') - first).dt.days

# Regrouping Marital Status: 'Alone' is merged into 'Single'; 'YOLO' and 'Absurd' become 'Unknown'.
# Series.map turns any label that is missing from the mapping into NaN.
df2['Marital_Status'] = df2['Marital_Status'].map({
    'Married': 'Married',
    'Together': 'Together',
    'Single': 'Single',
    'Divorced': 'Divorced',
    'Widow': 'Widow',
    'Alone': 'Single',
    'YOLO': 'Unknown',
    'Absurd': 'Unknown',
})

# ORDINALENCODING CATEGORICAL FEATURES

# Binarise the target: the values 1 and 2 are both collapsed to 1.
df2[target2] = df2[target2].replace([1, 2], 1)

d = {'Unknown': 0, 'Widow': 1, 'Divorced': 2, 'Single': 3, 'Together': 4, 'Married': 5}
df2['Marital_Status'] = df2['Marital_Status'].map(d)

d = {'Basic': 0, 'Graduation': 1, '2n Cycle': 2, 'Master': 3, 'PhD': 4}
df2['Education'] = df2['Education'].map(d)


# STANDARDIZING NUMERICAL FEATURES
# Dt_Customer is included here because it is now a numeric day count.
numerical_features = ['Income', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
                      'MntGoldProds', 'Year_Birth', 'Recency', 'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
                      'NumStorePurchases', 'NumWebVisitsMonth', 'Dt_Customer']
scaler = preprocessing.StandardScaler().fit(df2[numerical_features])
df2[numerical_features] = scaler.transform(df2[numerical_features])

# SAVING
df2.to_csv('../data/processed/marketing_campaign.csv')
df2

Unnamed: 0,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,...,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Response
0,-0.985345,1,3,0.235327,0,0,-1.531185,0.307039,0.983781,1.551577,...,2.510890,-0.550785,0.693904,0,0,0,0,0,0,1
1,-1.235733,1,3,-0.235826,1,1,1.190545,-0.383664,-0.870479,-0.636301,...,-0.568720,-1.166125,-0.130463,0,0,0,0,0,0,0
2,-0.317643,1,4,0.773633,0,0,0.205773,-0.798086,0.362723,0.570804,...,-0.226541,1.295237,-0.542647,0,0,0,0,0,0,0
3,1.268149,1,4,-1.022732,1,0,1.061881,-0.798086,-0.870479,-0.560857,...,-0.910898,-0.550785,0.281720,0,0,0,0,0,0,0
4,1.017761,4,5,0.241519,1,0,0.953012,1.550305,-0.389085,0.419916,...,0.115638,0.064556,-0.130463,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2235,-0.150717,1,5,0.358568,0,1,-0.135680,-0.107383,1.203678,0.419916,...,0.115638,-0.550785,-0.130463,0,0,0,0,0,0,0
2236,-1.903435,4,4,0.470064,2,1,1.655713,0.237969,0.303291,-0.661449,...,-0.226541,-0.243114,0.693904,0,0,0,1,0,0,0
2237,1.017761,1,2,0.189106,0,0,0.982703,1.446700,1.795020,0.545656,...,0.115638,2.218248,0.281720,0,1,0,0,0,0,0
2238,-1.068807,3,4,0.679035,0,1,0.977755,-1.419719,0.368666,0.092992,...,0.799996,1.295237,-0.954831,0,0,0,0,0,0,0


In [101]:
# Heart dataset: name the target column and load the raw CSV.
target3 = 'target'
df3 = pd.read_csv("../data/original/heart.csv")

# --- Standardise the numerical columns; all other columns stay as loaded ---
numerical_features = [
    'trestbps',
    'chol',
    'thalach',
    'oldpeak',
    'age',
]
scaler = preprocessing.StandardScaler()
df3[numerical_features] = scaler.fit_transform(df3[numerical_features])

# No feature selection is applied: in Assignment 3 the best accuracy scores
# were achieved with all features.

# --- Write the processed frame to disk, then display it ---
df3.to_csv('../data/processed/heart.csv')
df3

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0.952197,1,3,0.763956,-0.256334,1,0,0.015443,0,1.087338,0,0,1,1
1,-1.915313,1,2,-0.092738,0.072199,0,1,1.633471,0,2.122573,0,0,2,1
2,-1.474158,0,1,-0.092738,-0.816773,0,0,0.977514,0,0.310912,2,0,2,1
3,0.180175,1,1,-0.663867,-0.198357,0,1,1.239897,0,-0.206705,2,0,2,1
4,0.290464,0,0,-0.663867,2.082050,0,1,0.583939,1,-0.379244,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,0.290464,0,0,0.478391,-0.101730,0,1,-1.165281,1,-0.724323,1,0,3,0
299,-1.033002,1,3,-1.234996,0.342756,0,1,-0.771706,0,0.138373,1,0,3,0
300,1.503641,1,0,0.706843,-1.029353,1,1,-0.378132,0,2.036303,1,2,3,0
301,0.290464,1,0,-0.092738,-2.227533,0,1,-1.515125,1,0.138373,1,1,3,0
