In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Initial Data Cleaning and Exploration
Code for the initial data cleaning and exploration done before modeling   
_Author: Jimmy Charité_  
_Email: jimmy.charite@gmail.com_  
_Date: January 8, 2017_

## Directory & Packages

In [None]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm

The default directory is the code subdirectory. Changing to the main repo directory above.

Converting the image type variable into numeric

In [None]:
# Tabulate the raw 'image_type' column before recoding it.
# NOTE(review): pd_tab is a helper defined outside this view -- presumably a frequency table; confirm.
pd_tab(raw_data,'image_type')

In [None]:
# Work on a copy so the exploration below never mutates raw_data.
clean_data=raw_data.copy()

In [None]:
# Recode the target column: 'nonad.' -> 0, 'ad.' -> 1.
clean_data = clean_data.replace({'image_type': {'nonad.': 0, 'ad.': 1}})
clean_data.head()

Converting all other variables into numeric

In [None]:
# apply() hands pd.to_numeric one column at a time (default axis=0);
# anything that cannot be parsed as a number becomes NaN.
clean_data = clean_data.apply(pd.to_numeric, errors='coerce')
clean_data.head()

Inspecting the 'Height' feature

In [None]:
# Summary statistics over the non-missing heights.
clean_data['height'].dropna().describe()

In [None]:
# Distribution of the observed (non-missing) image heights.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- revisit when upgrading.
g = sns.distplot(clean_data['height'].dropna())
g.set_ylim(bottom=0)
g.set_xlim(left=0)
g.set_title('Image Heights\n', fontsize=20)
g.set_xlabel('Height', fontsize=15)

In [None]:
# Distribution of the natural log of the observed image heights.
g = sns.distplot(np.log(clean_data['height'].dropna()))
g.set_ylim(bottom=0)
g.set_title('Logged Image Heights\n', fontsize=20)
g.set_xlabel('Height', fontsize=15)

Taking the log of the continuous variables can be an optional pipeline step during the model training stage. In theory, many of the parametric models like the logistic classifier benefit from (standardized) approximately symmetric distributions. 

Inspecting the 'Width' feature

In [None]:
# Summary statistics over the non-missing widths.
clean_data['width'].dropna().describe()

In [None]:
# Distribution of the observed (non-missing) image widths.
g = sns.distplot(clean_data['width'].dropna())
g.set_ylim(bottom=0)
g.set_xlim(left=0)
g.set_title('Image Widths\n', fontsize=20)
g.set_xlabel('Width', fontsize=15)

In [None]:
# Distribution of the natural log of the observed image widths.
g = sns.distplot(np.log(clean_data['width'].dropna()))
g.set_ylim(bottom=0)
g.set_title('Logged Image Widths\n', fontsize=20)
g.set_xlabel('Width', fontsize=15)

Widths are bimodal and there isn't a simple transformation to address it. I may experiment with using categorical variables for the width. 

Inspecting the aspect ratio feature

In [None]:
# Flag rows where 'local' is missing, then compare the share of ads
# (mean of the 0/1 image_type, in percent) between flagged and unflagged rows.
clean_data['missing_local'] = clean_data['local'].isnull()
clean_data.groupby('missing_local')['image_type'].mean() * 100

The local variable is missing for only 15 observations. Assuming this is representative of the general rate at which it is missing, I will simply impute missing values as '0'. In practice, if the 'local' variable turns out to be an extremely important feature, I would discuss the issue with individuals in the company who fully understand how the data was generated, to see if there are smarter imputation approaches available or if the data collection process can be changed to avoid missing information on this variable. 

## Cleaning and Saving Data for Modeling

Starting with the raw data, I apply everything I learned from the data exploration to preparing the raw data for modeling. The code below will be re-used in the standalone python script that will be used to make predictions on new raw data.

### Upload Data

Main dataset

In [None]:
# The file is read without a header row (header=None), so the columns
# carry integer labels until real names are attached.
raw_data=pd.read_csv("./raw_data/data",header=None)
raw_data.head()

Attach column names 

In [None]:
# Each line of column.names.txt is split on ':' into two fields,
# which are labelled 'variable' and 'type' below.
col_names=pd.read_csv("./raw_data/column.names.txt",header=None,
                     sep=":")
col_names.columns=['variable','type']

In [None]:
# Append one extra row naming the target column ('image_type'), then
# pin the column order to ['variable', 'type'].
# NOTE(review): presumably the names file lists only the features -- confirm.
col_names = pd.concat(
    [col_names, pd.DataFrame({'variable': ['image_type'], 'type': ['0,1.']})],
    axis=0
)[['variable', 'type']]

In [None]:
# Label the raw data columns with the names collected in col_names.
raw_data.columns = col_names['variable'].tolist()

### Make Numerical

In [None]:
# Distinct raw 'width' entries that fail numeric conversion.
temp = raw_data.loc[pd.to_numeric(raw_data['width'], errors='coerce').isnull(), 'width']
np.unique(temp)

In [None]:
# Distinct raw 'aratio' entries that fail numeric conversion.
temp = raw_data.loc[pd.to_numeric(raw_data['aratio'], errors='coerce').isnull(), 'aratio']
np.unique(temp)

In [None]:
# Distinct raw 'local' entries that fail numeric conversion.
temp = raw_data.loc[pd.to_numeric(raw_data['local'], errors='coerce').isnull(), 'local']
np.unique(temp)

The non-numerical values enter as "?"

### Inspecting the Continuous Variables

In [None]:
# Number of rows with at least one missing value.
int(clean_data.isnull().any(axis=1).sum())

Counting missing instances by variable

In [None]:
# Per-variable count of missing values, largest first, plus the count
# expressed as a percentage of all rows (two decimals).
temp = clean_data.isnull().sum().reset_index()
temp.columns = ['variable', 'missing']
temp = temp.sort_values(by='missing', ascending=False)
temp['percent'] = (100 * temp['missing'] / len(clean_data)).round(2)

In [None]:
# Keep only the variables that have at least one missing value.
temp.loc[temp['missing'] > 0]

Missing values in the height, width, and aspect ratio

In [None]:
# Rows where height and width are present but the aspect ratio is missing.
int((clean_data['height'].notnull()
     & clean_data['width'].notnull()
     & clean_data['aratio'].isnull()).sum())

In [None]:
# Rows where height and aspect ratio are present but the width is missing.
int((clean_data['height'].notnull()
     & clean_data['width'].isnull()
     & clean_data['aratio'].notnull()).sum())

In [None]:
# Rows where width and aspect ratio are present but the height is missing.
int((clean_data['height'].isnull()
     & clean_data['width'].notnull()
     & clean_data['aratio'].notnull()).sum())

With the current data, it is not possible to impute missing data in one continuous variable with complete data from the other two continuous variables.

In [None]:
# Flag rows with a missing aspect ratio, then compare the share of ads
# (mean of the 0/1 image_type, in percent) between flagged and unflagged rows.
clean_data['missing_aratio'] = clean_data['aratio'].isnull()
clean_data.groupby('missing_aratio')['image_type'].mean() * 100

In [None]:
# Univariate binomial-family GLM: regress the ad indicator on the
# missing-aspect-ratio flag and print the fitted model summary.
f='image_type ~ missing_aratio'
results = smf.glm(formula=f, data=clean_data, 
                  family=sm.families.Binomial()).fit()
print(results.summary())

16% of instances with missing aspect ratios are ads and 8% of instances without missing aspect ratios are ads. The difference, in a univariate regression, is statistically significant. 

In light of the large percentage of missing values and the seeming non-randomness of the missing values with respect to the feature being classified, I will represent the aspect ratio, height, and width as categorical variables with 'missing' being the reference category. 

In [None]:
# Summary statistics over the non-missing aspect ratios.
clean_data['aratio'].dropna().describe()

In [None]:
# Distribution of the observed (non-missing) aspect ratios.
g = sns.distplot(clean_data['aratio'].dropna())
g.set_ylim(bottom=0)
g.set_xlim(left=0)
g.set_title('Image Aspect Ratio\n', fontsize=20)
g.set_xlabel('Ratio', fontsize=15)

In [None]:
# Percentage of all rows whose aspect ratio exceeds 10
# (the author recorded roughly 1.25% for this dataset).
100 * int((clean_data['aratio'] > 10).sum()) / len(clean_data)

In [None]:
# Distribution of the natural log of the observed aspect ratios.
g = sns.distplot(np.log(clean_data['aratio'].dropna()))
g.set_ylim(bottom=0)
g.set_title('Logged Image Aspect Ratio\n', fontsize=20)
g.set_xlabel('Ratio', fontsize=15)

Taking the log of the aspect ratio improves the symmetry of the distribution, but the result is still not approximately normal. On the non-logged scale, the aspect ratios greater than 10 definitely look like outliers; however, they are still within reason as aspect ratios for images. I created an image with an aspect ratio of 60 to confirm. Dropping outliers may result in an inability to make predictions for certain instances, which isn't always practical. Therefore, initially, I will include algorithms that are robust to outliers before experimenting with removing them.

### Inspecting the Binary Variables

In [None]:
# Variables that pandas stored with the generic 'object' dtype.
data_types.loc[data_types['d_type'] == 'object']

In [None]:
# Distinct raw 'height' entries that fail numeric conversion.
temp = raw_data.loc[pd.to_numeric(raw_data['height'], errors='coerce').isnull(), 'height']
np.unique(temp)

Height

In [None]:
# Bucket heights into 50-unit-wide, left-closed bins plus an open-ended
# top bin. Rows that match no bin (missing or negative heights) keep the
# 'height_NaN' reference label.
# Uses .loc: the .ix indexer was deprecated in pandas 0.20 and removed in 1.0.
# NOTE(review): assumes raw_data.height is already numeric at this point -- confirm.
height_edges = [0, 50, 100, 150, 200, 250, 300, 350, 400]
raw_data['height_cat'] = 'height_NaN'
for lower, upper in zip(height_edges[:-1], height_edges[1:]):
    in_bin = (raw_data.height >= lower) & (raw_data.height < upper)
    raw_data.loc[in_bin, 'height_cat'] = 'height_{}t{}'.format(lower, upper)
# Everything at or above the last edge goes into the open-ended bin.
raw_data.loc[raw_data.height >= height_edges[-1], 'height_cat'] = 'height_400t'

In [None]:
# One indicator column per height bin; the 'height_NaN' column is dropped
# so that it serves as the comparison category.
height_cats = pd.get_dummies(raw_data['height_cat']).drop('height_NaN', axis=1)
# The helper label column is no longer needed on raw_data.
del raw_data['height_cat']
height_cats.head()

Width

In [None]:
# Bucket widths into 50-unit-wide, left-closed bins plus an open-ended
# top bin. Rows that match no bin (missing or negative widths) keep the
# 'width_NaN' reference label.
# Uses .loc: the .ix indexer was deprecated in pandas 0.20 and removed in 1.0.
# NOTE(review): assumes raw_data.width is already numeric at this point -- confirm.
width_edges = [0, 50, 100, 150, 200, 250, 300, 350, 400]
raw_data['width_cat'] = 'width_NaN'
for lower, upper in zip(width_edges[:-1], width_edges[1:]):
    in_bin = (raw_data.width >= lower) & (raw_data.width < upper)
    raw_data.loc[in_bin, 'width_cat'] = 'width_{}t{}'.format(lower, upper)
# Everything at or above the last edge goes into the open-ended bin.
raw_data.loc[raw_data.width >= width_edges[-1], 'width_cat'] = 'width_400t'

In [None]:
# One indicator column per width bin; the 'width_NaN' column is dropped
# so that it serves as the comparison category.
width_cats = pd.get_dummies(raw_data['width_cat']).drop('width_NaN', axis=1)
# The helper label column is no longer needed on raw_data.
del raw_data['width_cat']
width_cats.head()

Switching the categorical with binary variables

In [None]:
# Remove the three continuous columns from raw_data in place.
raw_data.drop(['height', 'width', 'aratio'], axis=1, inplace=True)

In [None]:
# Put the indicator columns in front of the remaining raw_data columns.
# Rebinds raw_data, so running this cell twice would prepend the
# indicator columns a second time.
raw_data = pd.concat(
    (height_cats, width_cats, aspect_cats, raw_data),
    axis=1
)
raw_data.head()

Without domain knowledge or clear business logic, turning continuous variables into a series of categorical variables is a mix of empiricism and guessing. I inspected the histograms and selected partitions that made sense. This part of the model building process can be refined through iteration.

### Saving Final Modeling Dataset

Normally I pickle datasets. However, to make the code more portable, I will save it as a csv.

In [None]:
# Column means of clean_data expressed as percentages (two decimals),
# sorted from most to least frequently affirmative.
temp = clean_data.mean().reset_index(name='Percent')
temp.columns = ['Variable', 'Percent']
# Drop the first three rows (the continuous variables). Take an explicit
# copy so the column assignment below writes to an independent frame
# instead of a slice, which avoids SettingWithCopyWarning.
# NOTE(review): assumes the first three columns of clean_data are the continuous ones -- confirm.
temp = temp.iloc[3:].copy()
temp['Percent'] = np.round(temp['Percent'] * 100, 2)
temp = temp.sort_values(by='Percent', ascending=False)

In [None]:
# First rows of temp.
temp.head()

In [None]:
# Last rows of temp.
temp.tail()

In [None]:
# Distribution of the per-variable affirmative percentages held in temp,
# with the x-axis fixed to the 0-100 range.
g = sns.distplot(temp['Percent'])
g.set_ylim(bottom=0)
g.set_xlim(left=0, right=100)
g.set_title('Distribution of Sparsity of Binary Variables\n', fontsize=20)
g.set_xlabel('Percent of Affirmative/True Instances', fontsize=15)

In [None]:
# Fraction (0-1, not a percentage) of binary features that are affirmative
# in fewer than 10% of instances.
int((temp['Percent'] < 10).sum()) / len(temp)

In [None]:
# Fraction (0-1, not a percentage) of binary features that are affirmative
# in fewer than 1% of instances.
int((temp['Percent'] < 1).sum()) / len(temp)

In [None]:
# Fraction (0-1, not a percentage) of binary features that are affirmative
# in fewer than 0.1% of instances.
int((temp['Percent'] < .1).sum()) / len(temp)

The feature space for the binary variables is sparse: 99% of the binary variables are affirmative less than 10% of the time and 86% are affirmative less than 1% of the time. 

In addition to using algorithms robust to sparse features, I may experiment with cross-validation driven feature selection (like 'VarianceThreshold').

### Inspecting Missing Values

Rows with all missing data

In [None]:
# Number of rows in which every value is missing.
int(clean_data.isnull().all(axis=1).sum())

Rows with any missing data

In [None]:
# Each line of column.names.txt is split on ':'; there is no header row,
# so the resulting columns carry integer labels at this point.
col_names=pd.read_csv("./raw_data/column.names.txt",header=None,
                     sep=":")
col_names.head()

In [None]:
# Label the two parsed fields.
col_names.columns=['variable','type']

In [None]:
# First rows of col_names.
col_names.head()

In [None]:
# Label the raw data columns with the names collected in col_names.
raw_data.columns = col_names['variable'].tolist()
raw_data.head()

## Data Quality

### Data Types

In [None]:
# Two-column table: each raw_data column name ('variable') next to the
# dtype pandas assigned to it ('d_type').
data_types = raw_data.dtypes.rename_axis('variable').reset_index(name='d_type')

In [None]:
# Tabulate the 'd_type' column of data_types.
# NOTE(review): pd_tab is a helper defined outside this view -- presumably a frequency table; confirm.
pd_tab(data_types,'d_type')

All the features will be made numerical.

### Non-Numerical Values

Non-numerical features

In [None]:
# Impute missing 'local' values with 0.
# Uses .loc: the .ix indexer was deprecated in pandas 0.20 and removed in 1.0.
raw_data.loc[raw_data.local.isnull(), 'local'] = 0

### Make the Continuous Variables Categorical

Aspect Ratio

In [None]:
# Bucket aspect ratios into 2-unit-wide, left-closed bins plus an
# open-ended top bin. Rows that match no bin (missing or negative ratios)
# keep the 'aratio_NaN' reference label.
# Uses .loc: the .ix indexer was deprecated in pandas 0.20 and removed in 1.0.
# NOTE(review): assumes raw_data.aratio is already numeric at this point -- confirm.
aratio_edges = [0, 2, 4, 6, 8, 10]
raw_data['aratio_cat'] = 'aratio_NaN'
for lower, upper in zip(aratio_edges[:-1], aratio_edges[1:]):
    in_bin = (raw_data.aratio >= lower) & (raw_data.aratio < upper)
    raw_data.loc[in_bin, 'aratio_cat'] = 'aratio_{}t{}'.format(lower, upper)
# Everything at or above the last edge goes into the open-ended bin.
raw_data.loc[raw_data.aratio >= aratio_edges[-1], 'aratio_cat'] = 'aratio_10t'

In [None]:
# One indicator column per aspect-ratio bin; the 'aratio_NaN' column is
# dropped so that it serves as the comparison category.
aspect_cats = pd.get_dummies(raw_data['aratio_cat']).drop('aratio_NaN', axis=1)
# The helper label column is no longer needed on raw_data.
del raw_data['aratio_cat']
aspect_cats.head()