### The aim of this kernel is to build a model that predicts the best Facebook ad campaign for a given customer.
#### There will be feature engineering, missing value imputation, and visualisations

In [None]:
import numpy as np # linear algebra
import matplotlib.pyplot as plt # For plotting
import pandas as pd
import seaborn as sns
%matplotlib inline
# Allow pandas to print up to 1000 rows before it truncates a displayed frame.
pd.set_option("display.max_rows", 1000)

In [None]:
# Read the raw CSV; the path is relative to the notebook's working directory.
raw_csv_path = "../input/data.csv"
df = pd.read_csv(raw_csv_path)

## Restructuring --- part of  preprocessing 

In [None]:

# Split the data into two dataframes: df1 holds the rows before position 761,
# df2 holds the remaining rows (the ones with missing values).
# .copy() gives each part its own data, so the edits below never act on a
# slice of df (which would trigger SettingWithCopyWarning).
df1 = df.iloc[:761].copy()
df2 = df.iloc[761:].copy()

# Restructure df2 so its columns match df1: the headers at positions 3..14
# move two places to the left, onto positions 1..12.
c = list(df2)
c[1:13] = c[3:15]
df2.columns = c

# Further restructuring: the last two columns are dropped after the shift.
df2 = df2.iloc[:, :-2]
# The columns now labelled campaign_id / fb_campaign_id are renamed to
# reporting_start / reporting_end.
df2 = df2.rename(columns={'campaign_id': 'reporting_start', 'fb_campaign_id': 'reporting_end'})

# Re-create the two id columns, entirely missing, at positions 3 and 4.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df2.insert(3, 'campaign_id', np.nan)
df2.insert(4, 'fb_campaign_id', np.nan)

df2.head()

In [None]:
# Stack the two parts back into a single frame with a fresh 0..n-1 index.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
df = pd.concat([df1, df2], ignore_index=True)  # final dataframe
df.head()

### Feature Engineering

In [None]:
import datetime
# See how long the campaign durations have been.
df['reporting_start'] = pd.to_datetime(df['reporting_start'])
df['reporting_end'] = pd.to_datetime(df['reporting_end'])

# Duration is end minus start, so a campaign that ends on or after its start
# date gets a non-negative value.
df['campaign_duration'] = df['reporting_end'] - df['reporting_start']

df['campaign_duration'].value_counts()

##### Since every campaign lasted less than one day, I have decided to remove these variables, as they do not offer any information gain.

In [None]:
# Remove the two reporting-date columns and the duration derived from them.
unused_columns = ['campaign_duration', 'reporting_start', 'reporting_end']
df = df.drop(columns=unused_columns)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Label encode gender: fit the encoder on the column, then replace the
# column with the resulting integer codes.
lb = LabelEncoder()
gender_codes = lb.fit(df['gender']).transform(df['gender'])
df['gender'] = gender_codes

In [None]:
# Change these variables to the proper format of an integer.
for count_column in ('total_conversion', 'approved_conversion', 'impressions'):
    df[count_column] = df[count_column].astype(int)

In [None]:
# Get dummies for age: build one age_<value> indicator column per distinct
# value and append them to the frame in place of the original age column.
age_dummies = pd.get_dummies(df['age'], prefix='age')
df = pd.concat([df.drop(columns='age'), age_dummies], axis=1)

In [None]:
# The dataframe should be solid now; preview its first five rows.
df.head(n=5)

### Let's look for missing data ..................

### The values of the two columns campaign_id and fb_campaign_id are missing. As campaign_id has only three distinct values, it will be a fundamental part of the ad-bidding model.

In [None]:
#sns.countplot(df["campaign_id"])


In [None]:
# imputation of missing values
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, IterativeImputer, BiScaler
# Run fancyimpute's KNN (k=3) over the whole frame and wrap the returned
# array in a DataFrame that reuses df's column names and index.
imputed_values = KNN(k=3).fit_transform(df)
X = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)



In [None]:
# Store both id columns as integers.
for id_column in ('campaign_id', 'fb_campaign_id'):
    X[id_column] = X[id_column].astype(int)

# Map campaign id 1177 onto 1178.
# NOTE(review): presumably 1177 is an imputed 1178 that the int cast cut short - confirm.
X['campaign_id'] = X['campaign_id'].replace({1177: 1178})

In [None]:
# Bar chart of how many rows fall under each campaign_id after imputation.
# The column is passed as x= explicitly: seaborn 0.11 deprecated the positional
# form, and from 0.12 the first positional argument is `data`, not `x`.
sns.countplot(x=X["campaign_id"]);


In [None]:
#sns.countplot(df["campaign_id"])

### Visualisation

#### Now that the dataframe is restructured - let's identify some patterns in the data.

In [None]:
def distComparison(df1, df2):
    """Plot the distribution of each feature of ``df1`` against ``df2``.

    Numeric columns get two overlaid KDE curves (labelled 'df' and
    'df_missing'); columns of dtype ``category`` get a grouped bar chart of
    relative frequencies (labelled 'df1' and 'df2'). All panels are laid out
    on one 15x15 inch matplotlib figure.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare. The columns to plot are chosen from ``df1``; each
        chosen column is also read from ``df2``.

    Returns
    -------
    None
        The function only draws; nothing is drawn when ``df1`` has no columns.
    """
    n_cells = len(df1.columns)
    if n_cells == 0:
        # No columns means no grid can be built; nothing to plot.
        return None

    # Imported locally so the function works no matter which notebook cell
    # (if any) has imported these modules before the call.
    import itertools
    import matplotlib.gridspec as gridspec

    # Round the cell count up to an even number so it factorises into a grid.
    if n_cells % 2 != 0:
        n_cells += 1

    # Grid width: the largest divisor of n_cells that does not exceed its
    # square root. Integer arithmetic keeps both grid dimensions exact ints.
    n_cols = int(np.floor(np.sqrt(n_cells)))
    while n_cells % n_cols != 0:
        n_cols -= 1
    n_rows = n_cells // n_cols
    coords = list(itertools.product(range(n_rows), range(n_cols)))

    numerics = df1.select_dtypes(include=[np.number]).columns
    cats = df1.select_dtypes(include=['category']).columns

    fig = plt.figure(figsize=(15, 15))
    grid = gridspec.GridSpec(n_rows, n_cols)
    grid.update(wspace=0.25, hspace=0.25)

    # Numeric columns fill the first grid cells, one KDE panel per column.
    for (row, column), col in zip(coords, numerics):
        ax = fig.add_subplot(grid[row, column])
        sns.kdeplot(df1[col].dropna(), ax=ax, label='df').set(xlabel=col)
        sns.kdeplot(df2[col].dropna(), ax=ax, label='df_missing')
        # Draw the legend explicitly so the two curves can be told apart;
        # skipped when no labelled curve was drawn on this panel.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()

    # Categorical columns take the grid cells after the numeric ones.
    for (row, column), col in zip(coords[len(numerics):], cats):
        ax = fig.add_subplot(grid[row, column])

        # Relative frequency of each category in each frame, in long format.
        df1_temp = df1[col].value_counts()
        df2_temp = df2[col].value_counts()
        df1_temp = pd.DataFrame({col: df1_temp.index, 'value': df1_temp/len(df1), 'Set': np.repeat('df1', len(df1_temp))})
        df2_temp = pd.DataFrame({col: df2_temp.index, 'value': df2_temp/len(df2), 'Set': np.repeat('df2', len(df2_temp))})

        sns.barplot(x=col, y='value', hue='Set', data=pd.concat([df1_temp, df2_temp]), ax=ax).set(ylabel='Percentage')

In [None]:
import itertools
import matplotlib.gridspec as gs

# Rows from position 761 onward are the ones split off earlier as having
# missing values; the rows before that position are the rest.
n_complete_rows = 761
df_not_missing = X.iloc[:n_complete_rows]
df_missing = X.iloc[n_complete_rows:]

distComparison(df_not_missing, df_missing)


Conclusions:
### The age distribution is fairly similar in both datasets, with df_missing having proportionally more younger people. The gender split is interesting: df_missing contains proportionally more females, while df contains proportionally more males.


## Model Fitting

In [None]:
# Build the modelling frame as a typed copy of X: spent and the three
# interest columns become integers, campaign_id becomes a categorical.
model_dtypes = {
    'spent': int,
    'interest1': int,
    'interest2': int,
    'interest3': int,
    'campaign_id': 'category',
}
df = X.astype(model_dtypes)

df.dtypes

In [None]:
# Drop any rows that still contain missing values.
df = df.dropna()

# Collapse every approved_conversion count from 2 to 21 into 1.
# The values to replace must be a flat list of integers: [range(2, 22)] is a
# list holding a single range object, not the numbers 2..21.
df['approved_conversion'] = df['approved_conversion'].replace(list(range(2, 22)), 1)

df['approved_conversion'] = df['approved_conversion'].astype('category')
df['approved_conversion'].value_counts()

## Approved conversions are collapsed as above to improve class balance and to suit the ad-bidding model.

In [None]:
from sklearn.metrics import  classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE


In [None]:
# Turn +inf / -inf into NaN so they are counted as missing values.
infinities = [np.inf, -np.inf]
df = df.replace(infinities, np.nan)

# Number of missing values per column.
df.isna().sum()

In [None]:
# Keep only the rows that have no missing values.
df = df.dropna()

In [None]:

# Log-transform the three interest features, once.
# Taking the log twice gives log(log(x)), which is -inf for x == 1 and NaN for
# x < 1; those values would go straight into the model, because the inf/NaN
# cleanup runs before this cell.
interest_cols = ['interest1', 'interest2', 'interest3']
df[interest_cols] = np.log(df[interest_cols])

In [None]:
# Columns used as model inputs; approved_conversion is the target.
feature_columns = [
    'campaign_id',
    'interest1', 'interest2', 'interest3',
    'gender',
    'age_30-34', 'age_35-39', 'age_40-44', 'age_45-49',
]
X = df.loc[:, feature_columns]
y = df.loc[:, 'approved_conversion']

In [None]:


#from imblearn.combine import SMOTETomek

# smt = SMOTETomek(ratio='auto')
# X, y = smt.fit_sample(X, y)



In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Logistic regression wrapped in recursive feature elimination.
# n_features_to_select is passed by keyword because scikit-learn >= 1.0
# rejects it as a positional argument.
logmodel = RFE(LogisticRegression(), n_features_to_select=9)
logmodel.fit(X_train, y_train)

In [None]:
# Predict the held-out rows and print precision / recall / F1 per class.
predictions = logmodel.predict(X_test)
report = classification_report(y_test, predictions)
print(report)


## Ad-bidding Model

In [None]:
# Here is the testing data that the model hasn't seen before (first 10 rows).
X_test.head(10)

In [None]:
# Model's prediction for the first 10 rows of test data.
first_ten_rows = X_test.iloc[:10]
logmodel.predict(first_ten_rows)

In [None]:
# Prediction for the first row of the test data, kept as a one-row frame.
X1 = X_test.iloc[:1]
logmodel.predict(X1)