# Modeling

In [1]:
# importing libraries
import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from sklearn import preprocessing
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
warnings.filterwarnings("ignore")

# adding scripts to path
sys.path.append(os.path.abspath("../scripts/python"))

In [2]:
# loading scripts
# Both modules are project-local; they resolve because the first cell appended
# ../scripts/python to sys.path.
from connection_manager import Manager
from modeling_utils import Modeling_Utils
# man is used below to connect to the server and fetch data;
# util provides the summary and cleaning helpers applied to the frame.
man = Manager()
util = Modeling_Utils()

In [3]:
# establishing connection
# Connection settings are read from the environment so real credentials do not
# have to be committed with the notebook; the fallbacks are the values that
# were previously hard-coded, so a local setup keeps working unchanged.
connection, cursor = man.connect_to_server(
    host=os.environ.get("WAREHOUSE_DB_HOST", "localhost"),
    port=int(os.environ.get("WAREHOUSE_DB_PORT", "5432")),
    user=os.environ.get("WAREHOUSE_DB_USER", "warehouse"),
    password=os.environ.get("WAREHOUSE_DB_PASSWORD", "warehouse"),
    dbName=os.environ.get("WAREHOUSE_DB_NAME", "warehouse"),
)

successfully connected; cursor: <cursor object at 0x7fc7da5dec70; closed: 0>


In [4]:
# fetching data
# NOTE(review): limit=10000 presumably caps the number of rows returned (the
# summary cell reports 10000 rows) - confirm in Manager.fetch_data.
data = man.fetch_data(conn=connection, limit=10000)
# preview the first five rows
data.head(5)

Unnamed: 0,campaign_id,types,width,height,creative_id,auction_id,browser_ts,game_key,geo_country,site_name,...,volume_agreed,gross_cost_or_budget,agency_fee,percentages,net_cost,design_feature,feature_type,feature_variety,sub_feature,feature_value
0,jmu9ci8,impression,480,480,akii53au,43afc590d3ae4fd98139aa387691d02b,2021-05-27 02:00:09,adunit-hitmans-wifes-body-guard-user-choice-v1...,United States,com.loop.match3d,...,214285.71,75000.0,Percentage,0.0,75000,,,,,
1,jmu9ci8,first_dropped,480,480,akii53au,2658819c852f45acb4442d51dd7273b3,2021-05-27 02:00:13,adunit-hitmans-wifes-body-guard-user-choice-v1...,United States,com.tripledot.solitaire,...,214285.71,75000.0,Percentage,0.0,75000,,,,,
2,jmu9ci8,impression,250,250,xxi0rxke,1e1f9502ec674ceea675e6f2a0630b2c,2021-05-27 02:00:15,adunit-hitmans-wifes-body-guard-user-choice-mp...,United States,www.powerstroke.org,...,214285.71,75000.0,Percentage,0.0,75000,,,,,
3,jmu9ci8,impression,480,480,2zz4r3bk,dd0ec25238954fe8ba341696f9f10dab,2021-05-27 02:00:20,adunit-hitmans-wifes-body-guard-user-choice-v1...,United States,1514542157,...,214285.71,75000.0,Percentage,0.0,75000,,,,,
4,jmu9ci8,impression,250,250,xxi0rxke,605fb016d0bd48ca870942a898dd979f,2021-05-27 02:00:32,adunit-hitmans-wifes-body-guard-user-choice-mp...,United States,ball-pythons.net,...,214285.71,75000.0,Percentage,0.0,75000,,,,,


In [5]:
# checking data
# print the frame's dimensions, then display the per-column summary
# (missing counts, data type, unique values) produced by util.summ_columns
print(f" There are {data.shape[0]} rows and {data.shape[1]} columns")
util.summ_columns(data)

 There are 10000 rows and 36 columns


Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,campaign_id,0,0.0,object,10
1,types,0,0.0,object,3
2,width,0,0.0,object,2
3,height,0,0.0,object,2
4,creative_id,0,0.0,object,36
5,auction_id,0,0.0,object,10000
6,browser_ts,0,0.0,datetime64[ns],8328
7,game_key,0,0.0,object,24
8,geo_country,0,0.0,object,3
9,site_name,0,0.0,object,1063


## Automated Data Cleaning

In [6]:
# apply variable remover
# NOTE(review): 30 looks like a missing-percentage threshold above which a
# column is dropped - confirm in Modeling_Utils.reduce_dim_missing.
data2 = util.reduce_dim_missing(data, 30)
util.summ_columns(data2)

Unnamed: 0,variables,missing_count,missing_percent_(%),data_type,unique_values
0,campaign_id,0,0.0,object,10
1,types,0,0.0,object,3
2,width,0,0.0,object,2
3,height,0,0.0,object,2
4,creative_id,0,0.0,object,36
5,auction_id,0,0.0,object,10000
6,browser_ts,0,0.0,datetime64[ns],8328
7,game_key,0,0.0,object,24
8,geo_country,0,0.0,object,3
9,site_name,0,0.0,object,1063


In [None]:
# applying the mode based missing filler
# NOTE(review): which columns are mode-filled is decided inside
# Modeling_Utils.fill_missing_by_mode - presumably the categorical ones; confirm.
data3 = util.fill_missing_by_mode(data2)
util.summ_columns(data3)

In [None]:
# apply mean based missing value filler
# NOTE(review): presumably targets the numeric columns still holding missing
# values after the mode fill - confirm in Modeling_Utils.fill_missing_by_mean.
data4 = util.fill_missing_by_mean(data3)
util.summ_columns(data4)

In [None]:
# preview the frame after both missing-value fillers have been applied
data4.head()

## Automated Filtering

In [None]:
# remove those with high correlation
# label encoding
def labeler(df):
    """
    label-encodes every object-typed column of a dataframe.
    Args:
        df: the dataframe to encode; it is copied, the caller's frame is untouched
    Return:
        encoded: the copy, with each object column replaced by integer codes
                 (values are cast to str before encoding)
        encoder: the single LabelEncoder instance used for all columns. It is
                 re-fitted for each column, so after the call it only holds the
                 classes of the last object column (or is unfitted if the frame
                 had no object columns).
    """
    encoded = df.copy()
    encoder = preprocessing.LabelEncoder()
    object_columns = [name for name in encoded.columns if encoded[name].dtypes == 'object']
    for name in object_columns:
        as_text = encoded[name].astype(str)
        encoded[name] = encoder.fit_transform(as_text)

    return encoded, encoder

def remove_correlated(df, th):
    """
    removes highly correlated variables from a dataframe.

    Object columns are label-encoded (through labeler) only to measure the
    correlation; the returned dataframe keeps the original, un-encoded values.
    For every pair of columns whose absolute correlation exceeds th, the
    column that comes later in the column order is the one removed.

    Args:
        df: a features dataframe that holds the variables
        th: a threshold correlation value to decide which variables to remove
    Return:
        features_df: a new features dataframe with low correlation values,
                     or None when the computation fails (the error is printed
                     instead of being silently discarded).
    """
    try:
        encoded, _ = labeler(df)

        # Correlate numeric/boolean columns only: columns of other dtypes
        # (e.g. datetimes) are not valid input for corr() on recent pandas
        # and would otherwise abort the whole step.
        corrmat = encoded.select_dtypes(include=["number", "bool"]).corr()

        # Row i of the lower triangle holds the correlations of column i with
        # every earlier column; NaN correlations never compare greater than th.
        correlated_features = [
            colname
            for i, colname in enumerate(corrmat.columns)
            if (corrmat.iloc[i, :i].abs() > th).any()
        ]

        print(f"number of correlated variables: {len(correlated_features)}")
        print("..................................................")
        print("correlated features: ", correlated_features)

        # drop() returns a new frame, so the caller's dataframe is not modified
        features_df = df.drop(columns=correlated_features)

        #logger.info("correlated variables successfully removed")

        return features_df

    except Exception as error:
        # Best-effort step: report the reason instead of hiding it, and keep
        # the historical "None on failure" result for callers.
        #logger.warning("could not remove highly correlated variables")
        print(f"could not remove highly correlated variables: {error!r}")
        return None

In [None]:
# apply correlation based variable remover
data5 = remove_correlated(data4, 0.5)
# summ_columns is a method of util (the Modeling_Utils instance created above)
util.summ_columns(data5)

In [None]:
# remove the blacklist variables (ids, dates, etc...)
# Names listed here that are not present in the frame are tolerated, because
# remove_cols drops with errors='ignore'.
blacklist = ["game_key", "browser_ts", "creative_id", 
            "auction_id", "campaign_id", "campaign_name", 
            "descriptions", "kpis", "black_white_audience", "submission_date"]

def remove_cols(df, cols, keep=False):
    """
    returns a copy of the dataframe restricted by column name.

    keep=False (default): the columns named in cols are dropped; names that
                          are not in the dataframe are ignored.
    keep=True:            only the columns named in cols are kept, in the
                          order given.
    """
    working = df.copy()
    if not keep:
        return working.drop(columns=cols, errors='ignore')

    return working.loc[:, cols]

In [None]:
# apply blacklist remover
data6 = remove_cols(data5, blacklist)
# summ_columns is a method of util (the Modeling_Utils instance created above)
util.summ_columns(data6)

In [None]:
# Down-sample to at most 5000 rows. The size is capped at the frame length so
# the cell does not raise on smaller frames, and the seed (0, as used for the
# split and the forest) makes a fresh run select the same rows every time.
# Note: the cell overwrites data6, so re-running it alone re-samples the sample.
data6 = data6.sample(n=min(5000, len(data6)), random_state=0)
data6.head()

## Automated Preparation

In [None]:
# identify feature and target variables
# the target is the "types" column; every other remaining column is a feature.
# (text cleaning and label encoding of the features happen in the cells below)
target = data6["types"]
features = remove_cols(data6, ["types"])
features.head()

In [None]:
# use nlp on categorical variables to tokenize them
# first, let's process the text-based features
import re

def clean_text(column):
    """
    normalises the text of every value in a column.

    Each value is converted with str() and then, in order:
      1. every non-word character is replaced by a space
      2. single letters standing between whitespace are removed
      3. a single letter at the very start of the text is removed
      4. runs of whitespace are collapsed to one space
      5. a leading "b " prefix is removed
      6. the text is lower-cased
    Leading/trailing spaces left over from step 1 are not stripped.

    Args:
        column: an iterable of values (e.g. a pandas Series)
    Return:
        processed_feature: a list with one cleaned string per input value
    """
    non_word = re.compile(r'\W')
    inner_single_char = re.compile(r'\s+[a-zA-Z]\s+')
    # Anchored with ^ (start of text). The previous pattern escaped the caret
    # (r'\^...'), i.e. looked for a literal "^", which step 1 has already
    # turned into a space - so it could never match.
    leading_single_char = re.compile(r'^[a-zA-Z]\s+')
    whitespace_run = re.compile(r'\s+')
    byte_prefix = re.compile(r'^b\s+')

    processed_feature = []

    for sentence in column:
        # Remove all the special characters
        processed = non_word.sub(' ', str(sentence))

        # remove all single characters
        processed = inner_single_char.sub(' ', processed)

        # Remove single characters from the start
        processed = leading_single_char.sub('', processed)

        # Substituting multiple spaces with single space
        processed = whitespace_run.sub(' ', processed)

        # Removing prefixed 'b'
        processed = byte_prefix.sub('', processed)

        # Converting to Lowercase
        processed_feature.append(processed.lower())

    return processed_feature


In [None]:
# automatic object cleaner.
def process_features(df, cols=None):
    """
    cleans the text of a dataframe's columns with clean_text.

    Args:
        df: the dataframe to process; it is copied, the caller's frame is untouched
        cols: optional column name or list of column names to clean. When None
              (default) every object-typed column is cleaned. Columns named
              explicitly are cleaned whatever their dtype, which turns their
              values into strings; an unknown name raises KeyError.
    Return:
        df: the copy with the selected columns replaced by their cleaned text
    """
    df = df.copy()
    if cols is None:
        # Pick the columns straight from the frame's dtypes so names and
        # dtypes cannot get out of step with each other.
        cols = [name for name, dtype in df.dtypes.items() if dtype == "object"]
    elif isinstance(cols, str):
        cols = [cols]

    for name in cols:
        df[name] = clean_text(df[name])

    return df

In [None]:
# clean the text of the feature columns (cols is left at its default)
features2 = process_features(features)
features2.head()

In [None]:
# label-encode the object columns; enco is the encoder object returned by
# labeler and is the one passed to ml_track in the last cell.
# Note: labeler re-fits that one encoder per column, so enco only holds the
# classes of the last object column.
features3, enco = labeler(features2)
features3.head()

In [None]:
# splitting training and testing batch
from sklearn.model_selection import train_test_split

# hold out 20% of the rows for testing; random_state=0 keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(features3, target, test_size=0.2, random_state=0)

## Training and Testing

In [None]:
# random forest training
# train the model
from sklearn.ensemble import RandomForestClassifier

# 200 trees with a fixed seed; fitted on the training split only
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

In [None]:
# testing
# test the model: predict on the held-out split and compare with y_test
predictions = text_classifier.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# confusion matrix, per-class report, then overall accuracy
print(confusion_matrix(y_test,predictions))
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

In [None]:
# feature importance
import io
from PIL import Image
from collections import OrderedDict
from operator import itemgetter
def get_importance(model, df):
    """
    plots and returns the feature importances of a fitted model.

    Args:
        model: a fitted estimator exposing feature_importances_
        df: the features dataframe; its columns label the importances, so
            they must be in the same order as the importances
    Return:
        img_buf: an open io.BytesIO holding the bar chart as PNG, rewound to
                 the start so the caller can read or save it
        imp_dict: an OrderedDict mapping feature name -> importance, sorted by
                  ascending importance
    Raises:
        ValueError: if the number of columns differs from the number of
                    importances (the labels would be wrong)
    """
    features = df.columns.to_list()
    importance = model.feature_importances_
    if len(features) != len(importance):
        raise ValueError(
            f"df has {len(features)} columns but the model reports "
            f"{len(importance)} feature importances"
        )

    imp_dict = OrderedDict(
        sorted(dict(zip(features, importance)).items(), key=itemgetter(1))
    )
    features_i = list(imp_dict.keys())
    importance2 = list(imp_dict.values())

    plt.figure(figsize=(16,5))
    # x/y are passed by keyword: current seaborn releases no longer accept
    # them positionally.
    ax = sns.barplot(x=features_i, y=importance2, color="darkgreen")
    # the dict is sorted ascending; flipping the axis shows the most
    # important feature first
    ax.invert_xaxis()
    locs, labels = plt.xticks()
    plt.setp(labels, rotation=45)

    img_buf = io.BytesIO()
    plt.savefig(img_buf, format='png')

    img_buf.seek(0)
    im = Image.open(img_buf)
    im.show(title="My Image")

    # Hand back a readable buffer: it is left open (closing it made the
    # returned object unusable) and rewound after PIL has read from it.
    img_buf.seek(0)
    return img_buf, imp_dict

#features, target= DT.target_feature(train_cl, 0)

# plot and collect the importances of the fitted forest, labelled with the
# columns of the encoded feature frame
imp_image, importance = get_importance(text_classifier, features3)




In [None]:
# hyper-parameters handed to train()
max_depth = 10
max_features = 0.75
n_estimators = 200

# NOTE(review): train is not defined or imported in any cell visible in this
# notebook - confirm where it comes from (a Modeling_Utils method?) before
# running, otherwise this raises NameError on a fresh kernel.
mod, par, metr = train(features3, target, max_depth, max_features, n_estimators)


In [None]:
# saving model and encoder.
# NOTE(review): ml_track is not defined or imported in any cell visible in
# this notebook - confirm its origin. Also note that enco only holds the
# classes of the last column encoded by labeler.
ml_track(enco, mod, par, metr)