In [361]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


train = pd.read_csv("train.csv")
campaign_data = pd.read_csv('campaign_data.csv')
train['date'] = pd.to_datetime(train['send_date'], format = "%d-%m-%Y %H:%M")

In [362]:
# split data on 75/25 split based on date
split_date = min(train['date']) + pd.Timedelta(days = 115)
test = train.loc[train['date'] >= split_date]
train = train.loc[train['date'] < split_date]

Unnamed: 0_level_0,id,user_id,campaign_id,send_date,is_open,is_click,date
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
14051,42_14051,14051,42,01-09-2017 19:55,0,0,2017-09-01 19:55:00
181789,33_181789,181789,33,24-07-2017 15:15,0,0,2017-07-24 15:15:00
231448,44_231448,231448,44,05-09-2017 11:36,0,0,2017-09-05 11:36:00
185580,29_185580,185580,29,01-07-2017 18:01,0,0,2017-07-01 18:01:00
177808,42_177808,177808,42,01-09-2017 20:13,0,0,2017-09-01 20:13:00


In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.index = train['user_id']
test.index = test['user_id']

# segment 1 - users common to train and test
train_test_common = train[train['user_id'].isin(test['user_id'].unique())]
test_train_common = test[test['user_id'].isin(train['user_id'].unique())]

# segment 1 - all users common to train and test
# before splitting into train and test - create click_through_rate encoded variable
# use segment_one to create features for test set
segment_one = pd.concat([train_test_common, test_train_common])
segment_one.head()

# Segment One currently contains emails from users that were in both train and test.

### Question: Should segment_one_train contain all train data, or only train data from users that are common to train and test (in segment 1).

In [363]:
segment_one_train = segment_one.loc[segment_one['date'] < split_date]
segment_one_train.shape

(681722, 7)

In [364]:
segment_one.shape

(906938, 7)

We'll use segment_one to create features for test data.  We'll generate all features, then extract only the test data.  After, we'll segment the train data and generate features for the train set based only on the train data.

In [369]:
segment_one['no_of_emails'] = segment_one.groupby('user_id').size()
segment_one['cumulative_count'] = segment_one.groupby('user_id').cumcount()
segment_one['click_through_rate'] = segment_one.groupby('user_id')['is_click'].sum() / segment_one.groupby('user_id').size()

def sorting(a):
    avg = np.mean(a)
    if avg > 0:
        return 1
    else:
        return 0

segment_one['has_opened'] = segment_one.groupby('user_id')['is_open'].agg(sorting)
segment_one['has_clicked'] = segment_one.groupby('user_id')['is_click'].agg(sorting)

# returns timestamp object day of week and hour of day
def weekday(a):
    return a.dayofweek

def hourofday(a):
    return a.hour

segment_one['day_of_week']= segment_one['date'].apply(weekday)
segment_one['hour_of_day']= segment_one['date'].apply(hourofday)
segment_one.drop('send_date', axis = 1, inplace = True)

segment_one.fillna(0, inplace = True)
segment_one.reset_index(drop = True, inplace=True)

def encode_feature(df, feature_name, column_to_agg, num_buckets = 30):
    
    # create name for new feture, and copy the dataframe
    bucket = feature_name + '_bucket_interval'
    bucket_val = feature_name + '_' + column_to_agg + '_bucket_avg_val'
    df_copy = df.copy()
        
    # create a new feature that contains the interval to which the observation belongs
    df_copy[bucket] = pd.qcut(df[feature_name],num_buckets, duplicates = 'drop') 
        
    # group the intervals together and calculate the mean price for each interval/bucket
    df_new = df_copy.groupby(df_copy[bucket], as_index = False).agg({column_to_agg:'mean'})
    df_new = pd.DataFrame(df_new)
    
    # specify the column names, and create a row that contains the number of the bucket for plotting
    df_new.rename(columns = {column_to_agg:bucket_val}, inplace = True)
    df_new['bucket_number'] = df_new.index + 1
    df_new.head()
    
    #join df and df_new on the new bucket_interval feature
    df_join = pd.merge(df_new, df_copy, on=bucket)
    df_join.drop([bucket, 'bucket_number'], axis = 1, inplace = True)
    return df_join


# Create encoded features
segment_one = encode_feature(segment_one, 'no_of_emails', 'is_click')
segment_one = encode_feature(segment_one,'no_of_emails', 'is_open')

# merge campagin data with emails
segment_one = campaign_data.merge(segment_one, on = 'campaign_id')
segment_one['train'] = 1

# Calculates the count received of each type of campaign - essentially whether or not they received that campaign
pivot_df = pd.pivot_table(segment_one, values="train", index="user_id", columns="campaign_id", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id'] + ['campaign_' + str(col) for col in range(29,55)]
segment_one = segment_one.merge(pivot_df, on = 'user_id')

# Calculates the count received of each type of communication
pivot_df = pd.pivot_table(segment_one, values="train", index="user_id", columns="communication_type", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id', 'conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count']
segment_one = segment_one.merge(pivot_df, on = 'user_id')

# calculate percentage of communcation type received for each user
segment_one['conference_percent']=segment_one['conference_count'] / segment_one['no_of_emails']
segment_one['corporate_percent']=segment_one['corporate_count'] / segment_one['no_of_emails']
segment_one['hackathon_percent']=segment_one['hackathon_count'] / segment_one['no_of_emails']
segment_one['newsletter_percent']=segment_one['newsletter_count'] / segment_one['no_of_emails']
segment_one['others_percent']=segment_one['others_count'] / segment_one['no_of_emails']
segment_one['upcoming_events_percent']=segment_one['upcoming_events_count'] / segment_one['no_of_emails']
segment_one['webinar_percent']=segment_one['webinar_count'] / segment_one['no_of_emails']

# drop unneccesary columns
segment_one.drop(['conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count'], axis = 1, inplace = True)

# NLP
vectorizer = CountVectorizer(min_df = 1)

corpus = segment_one['subject']
x = vectorizer.fit_transform(corpus)
x.toarray()
matrix1 = x.toarray()
vectorizer.vocabulary_.get('harvest')

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(matrix1)

asd = tfidf.toarray()
asd1 = pd.DataFrame(asd) 
features = vectorizer.get_feature_names() 
asd1.columns = features
segment_one = pd.concat((segment_one,asd1), axis = 1) 


Defaulting to column but this will raise an ambiguity error in a future version
  
Defaulting to column but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [370]:
segment_one.drop('train', axis = 1, inplace = True)

In [371]:
segment_one_test = segment_one.loc[segment_one['date'] >= split_date]
segment_one_test.drop('date', axis = 1, inplace = True)
segment_one_test.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,campaign_id,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_body,subject,email_url,no_of_emails_is_open_bucket_avg_val,no_of_emails_is_click_bucket_avg_val,id,user_id,is_open,is_click,no_of_emails,cumulative_count,click_through_rate,has_opened,has_clicked,day_of_week,hour_of_day,campaign_29,campaign_30,campaign_31,campaign_32,campaign_33,campaign_34,campaign_35,campaign_36,campaign_37,campaign_38,campaign_39,campaign_40,campaign_41,campaign_42,campaign_43,campaign_44,campaign_45,campaign_46,campaign_47,campaign_48,campaign_49,campaign_50,campaign_51,campaign_52,campaign_53,campaign_54,conference_percent,corporate_percent,hackathon_percent,newsletter_percent,others_percent,upcoming_events_percent,webinar_percent,000,2017,2018,50,action,ahead,allen,anand,and,announcing,articles,artificial,at,attend,av,before,bhu,big,biggest,bird,booz,borne,boss,business,by,can,ceo,chance,chat,chief,churn,click,codefest,coming,competitions,conference,convince,cross,data,datafest,datahack,day,days,delhi,dhs2017,dj,dr,early,emerging,ends,exciting,expert,expires,fireside,for,former,go,gramener,grow,hackathon,hackathons,hamilton,iit,in,india,innovate,inr,intelligence,is,job,join,july,just,keynote,kirk,lacs,largest,last,learning,live,look,machine,meetups,month,more,much,mumbai,ncr,new,newsletter,non,november,now,october,of,offer,on,opportunities,participate,passes,patil,peek,prediction,prizes,programmers,reasons,register,roles,save,science,scientist,sell,september,set,should,sneak,spot,stage,starts,summit,the,through,to,today,tonight,updates,upto,us,visualizing,watch,way,webinar,webinars,why,win,with,world,worth,you,your
3,54,Newsletter,63,58,8,4,"December Newsletter\r\n \r\nDear AVians,\r\n \...","[November Updates] - Announcing DataFest 2018,...",http://r.newsletters.analyticsvidhya.com/7vzmm...,0.09954,0.018054,54_159970,159970,1,0,4,3,0.25,1,1,4,20,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.33401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,54,Newsletter,63,58,8,4,"December Newsletter\r\n \r\nDear AVians,\r\n \...","[November Updates] - Announcing DataFest 2018,...",http://r.newsletters.analyticsvidhya.com/7vzmm...,0.09954,0.018054,54_78440,78440,0,0,4,3,0.0,0,0,4,20,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0.25,0.0,0.0,0.5,0.0,0.25,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.33401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,53,Conference,104,100,13,1,3 DAYS | 30 TALKS | 10 HACK SESSIONS | 6 WORKS...,[Register Now] Just 2 days to go for India's b...,http://r.newsletters.analyticsvidhya.com/7vv5g...,0.09954,0.018054,53_100277,100277,0,0,4,3,0.0,1,0,0,22,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307397,0.0,0.0,0.0,0.0,0.0,0.0,0.244538,0.0,0.0,0.0,0.0,0.0,0.0,0.307397,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241472,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.240055,0.0,0.261687,0.0,0.0,0.0,0.0,0.0,0.0,0.300778,0.24329,0.0,0.0,0.244538,0.0,0.0,0.0,0.0,0.261687,0.0,0.0,0.0,0.0,0.0,0.211448,0.0,0.0,0.211448,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.300778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,54,Newsletter,63,58,8,4,"December Newsletter\r\n \r\nDear AVians,\r\n \...","[November Updates] - Announcing DataFest 2018,...",http://r.newsletters.analyticsvidhya.com/7vzmm...,0.09954,0.018054,54_100277,100277,1,0,4,2,0.0,1,0,4,20,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.33401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,54,Newsletter,63,58,8,4,"December Newsletter\r\n \r\nDear AVians,\r\n \...","[November Updates] - Announcing DataFest 2018,...",http://r.newsletters.analyticsvidhya.com/7vzmm...,0.09954,0.018054,54_71972,71972,0,0,4,3,0.0,0,0,4,20,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0,0.0,0.0,0.75,0.0,0.25,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.33401,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.42153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


We have generated the features for segment_one_test using both the train and test data.  Now we will generate the same features for segment_one_train, only using segment_one_train data. 

In [372]:
segment_one_train['no_of_emails'] = segment_one_train.groupby('user_id').size()
segment_one_train['cumulative_count'] = segment_one_train.groupby('user_id').cumcount()
segment_one_train['click_through_rate'] = segment_one_train.groupby('user_id')['is_click'].sum() / segment_one.groupby('user_id').size()
segment_one_train['has_opened'] = segment_one_train.groupby('user_id')['is_open'].agg(sorting)
segment_one_train['has_clicked'] = segment_one_train.groupby('user_id')['is_click'].agg(sorting)
segment_one_train['day_of_week']= segment_one_train['date'].apply(weekday)
segment_one_train['hour_of_day']= segment_one_train['date'].apply(hourofday)
segment_one_train.drop(['date','send_date'], axis = 1, inplace = True)
segment_one_train.fillna(0, inplace = True)
segment_one_train.reset_index(drop = True, inplace=True)

# Create encoded features
segment_one_train = encode_feature(segment_one_train, 'no_of_emails', 'is_click')
segment_one_train = encode_feature(segment_one_train,'no_of_emails', 'is_open')

# merge campagin data with emails
segment_one_train = campaign_data.merge(segment_one_train, on = 'campaign_id')
segment_one_train['train'] = 1

# Calculates the count received of each type of campaign - essentially whether or not they received that campaign
pivot_df = pd.pivot_table(segment_one_train, values="train", index="user_id", columns="campaign_id", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id'] + ['campaign_' + str(col) for col in range(29,52)]
segment_one_train = segment_one_train.merge(pivot_df, on = 'user_id')

# Calculates the count received of each type of communication
pivot_df = pd.pivot_table(segment_one_train, values="train", index="user_id", columns="communication_type", aggfunc="count", fill_value=0).reset_index()
pivot_df.columns = ['user_id', 'conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count']
segment_one_train = segment_one_train.merge(pivot_df, on = 'user_id')

# calculate percentage of communcation type received for each user
segment_one_train['conference_percent']=segment_one_train['conference_count'] / segment_one_train['no_of_emails']
segment_one_train['corporate_percent']=segment_one_train['corporate_count'] / segment_one_train['no_of_emails']
segment_one_train['hackathon_percent']=segment_one_train['hackathon_count'] / segment_one_train['no_of_emails']
segment_one_train['newsletter_percent']=segment_one_train['newsletter_count'] / segment_one_train['no_of_emails']
segment_one_train['others_percent']=segment_one_train['others_count'] / segment_one_train['no_of_emails']
segment_one_train['upcoming_events_percent']=segment_one_train['upcoming_events_count'] / segment_one_train['no_of_emails']
segment_one_train['webinar_percent']=segment_one_train['webinar_count'] / segment_one_train['no_of_emails']

# drop unneccesary columns
segment_one_train.drop(['conference_count', 'corporate_count','hackathon_count','newsletter_count','others_count','upcoming_events_count','webinar_count'], axis = 1, inplace = True)

# NLP
vectorizer = CountVectorizer(min_df = 1)

corpus = segment_one_train['subject']
x = vectorizer.fit_transform(corpus)
x.toarray()
matrix1 = x.toarray()
vectorizer.vocabulary_.get('harvest')

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(matrix1)

asd = tfidf.toarray()
asd1 = pd.DataFrame(asd) 
features = vectorizer.get_feature_names() 
asd1.columns = features
segment_one_train = pd.concat((segment_one_train,asd1), axis = 1) 

Defaulting to column but this will raise an ambiguity error in a future version
  """Entry point for launching an IPython kernel.
Defaulting to column but this will raise an ambiguity error in a future version
  
Defaulting to column but this will raise an ambiguity error in a future version
  This is separate from the ipykernel package so we can avoid doing imports until
Defaulting to column but this will raise an ambiguity error in a future version
  after removing the cwd from sys.path.
Defaulting to column but this will raise an ambiguity error in a future version
  """


In [373]:
segment_one_train.drop('train', axis = 1, inplace = True)

In [374]:
segment_one_train.head()

Unnamed: 0,campaign_id,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,email_body,subject,email_url,no_of_emails_is_open_bucket_avg_val,no_of_emails_is_click_bucket_avg_val,id,user_id,is_open,is_click,no_of_emails,cumulative_count,click_through_rate,has_opened,has_clicked,day_of_week,hour_of_day,campaign_29,campaign_30,campaign_31,campaign_32,campaign_33,campaign_34,campaign_35,campaign_36,campaign_37,campaign_38,campaign_39,campaign_40,campaign_41,campaign_42,campaign_43,campaign_44,campaign_45,campaign_46,campaign_47,campaign_48,campaign_49,campaign_50,campaign_51,conference_percent,corporate_percent,hackathon_percent,newsletter_percent,others_percent,upcoming_events_percent,webinar_percent,000,2017,50,ahead,allen,anand,and,artificial,at,attend,av,before,bhu,big,bird,booz,borne,boss,business,by,can,ceo,chance,chat,chief,churn,click,codefest,coming,competitions,conference,convince,cross,data,datahack,day,days,delhi,dhs2017,dj,dr,early,emerging,ends,exciting,expert,expires,fireside,former,go,gramener,grow,hackathon,hackathons,hamilton,iit,in,india,innovate,inr,intelligence,join,july,just,keynote,kirk,lacs,largest,last,learning,live,look,machine,meetups,month,more,much,ncr,new,newsletter,non,now,october,of,offer,on,participate,passes,patil,peek,prediction,prizes,programmers,reasons,register,roles,save,science,scientist,sell,september,should,sneak,spot,starts,summit,the,through,to,today,tonight,upto,us,visualizing,watch,way,webinar,webinars,why,win,with,world,worth,you,your
0,29,Newsletter,67,61,12,3,"Dear AVians,\r\n \r\nWe are shaping up a super...",Sneak Peek: A look at the emerging data scienc...,http://r.newsletters.analyticsvidhya.com/7um44...,0.091663,0.016044,29_159970,159970,0,0,3,0,0.25,1,1,5,18,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.666667,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238928,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0,0.364583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0
1,32,Conference,24,19,7,1,\r\n \r\nHi ?\r\n \r\nBefore I dive into why y...,A.I. & Machine Learning: 5 reasons why you sho...,http://r.newsletters.analyticsvidhya.com/7uthl...,0.091663,0.016044,32_159970,159970,1,1,3,2,0.25,1,1,2,12,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.666667,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.196333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.175512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291255,0.0,0.0,0.291255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382148,0.0,0.0,0.0,0.175512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.382148,0.0,0.0,0.0,0.0,0.382148,0.0
2,36,Conference,13,11,2,1,Announcing Dr. Kirk Borne as Keynote Speaker\r...,"Dr Kirk Borne of Booz Allen Hamilton, to keyno...",http://r.newsletters.analyticsvidhya.com/7uxpa...,0.091663,0.016044,36_159970,159970,0,0,3,1,0.25,1,1,3,17,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.666667,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.165305,0.0,0.0,0.316539,0.0,0.0,0.0,0.24896,0.0,0.0,0.0,0.0,0.0,0.0,0.316539,0.316539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147775,0.0,0.0,0.0,0.0,0.0,0.316539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.316539,0.316539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.310818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.147775,0.0,0.0,0.262742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,29,Newsletter,67,61,12,3,"Dear AVians,\r\n \r\nWe are shaping up a super...",Sneak Peek: A look at the emerging data scienc...,http://r.newsletters.analyticsvidhya.com/7um44...,0.091663,0.016044,29_200791,200791,0,0,3,0,0.0,0,0,6,18,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.285325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238928,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0,0.364583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369033,0.0,0.0,0.0
4,30,Upcoming Events,18,14,7,1,"Dear AVians,\r\n \r\nAre your eager to know wh...",[July] Data Science Expert Meetups & Competiti...,http://r.newsletters.analyticsvidhya.com/7up0e...,0.091663,0.016044,30_200791,200791,0,0,3,1,0.0,0,0,2,14,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.333333,0.333333,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318457,0.408495,0.0,0.0,0.0,0.242403,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.26548,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.265311,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.311346


## Segment 2 - Users only found in test data (new customers)
#### segment_two_train 
We will generate features only using the training data.  We will not include any encoded features, as we will not have any prior data on our users in the test set.
#### segment_two_test
We will create features using both the train and test data, excluding all encoded features.

In [375]:
segment_two_train = train

In [376]:
segment_two_train['day_of_week']= segment_two_train['date'].apply(weekday)
segment_two_train['hour_of_day']= segment_two_train['date'].apply(hourofday)
segment_two_train.drop(['date','send_date'], axis = 1, inplace = True)
segment_two_train.fillna(0, inplace = True)
segment_two_train.reset_index(drop = True, inplace=True)

# merge campagin data with emails
segment_two_train = campaign_data.merge(segment_two_train, on = 'campaign_id')

#NLP
vectorizer = CountVectorizer(min_df = 1)
corpus = segment_two_train['subject']
x = vectorizer.fit_transform(corpus)
x.toarray()
matrix1 = x.toarray()
vectorizer.vocabulary_.get('harvest')

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(matrix1)

asd = tfidf.toarray()
asd1 = pd.DataFrame(asd) 
features = vectorizer.get_feature_names() 
asd1.columns = features
segment_two_train = pd.concat((segment_two_train,asd1), axis = 1) 

In [377]:
segment_two_train.shape

(761657, 140)

In [379]:
segment_two_test = test.drop(train_test_common['user_id'].unique())
segment_two_test.shape

(36318, 7)

In [380]:
segment_two_test.head()

Unnamed: 0_level_0,id,user_id,campaign_id,send_date,is_open,is_click,date
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
231024,54_231024,231024,54,01-12-2017 20:15,0,0,2017-12-01 20:15:00
34491,53_34491,34491,53,06-11-2017 22:38,0,0,2017-11-06 22:38:00
117634,53_117634,117634,53,06-11-2017 22:50,0,0,2017-11-06 22:50:00
154567,54_154567,154567,54,01-12-2017 20:10,0,0,2017-12-01 20:10:00
122301,54_122301,122301,54,01-12-2017 20:19,0,0,2017-12-01 20:19:00


In [381]:
segment_two_test['day_of_week']= segment_two_test['date'].apply(weekday)
segment_two_test['hour_of_day']= segment_two_test['date'].apply(hourofday)
segment_two_test.drop(['date','send_date'], axis = 1, inplace = True)
segment_two_test.fillna(0, inplace = True)
segment_two_test.reset_index(drop = True, inplace=True)

# merge campagin data with emails
segment_two_test = campaign_data.merge(segment_two_test, on = 'campaign_id')

# NLP
vectorizer = CountVectorizer(min_df = 1)

corpus = segment_two_test['subject']
x = vectorizer.fit_transform(corpus)
x.toarray()
matrix1 = x.toarray()
vectorizer.vocabulary_.get('harvest')

from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(matrix1)

asd = tfidf.toarray()
asd1 = pd.DataFrame(asd) 
features = vectorizer.get_feature_names() 
asd1.columns = features
segment_two_test = pd.concat((segment_two_test,asd1), axis = 1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-

In [382]:
segment_two_test.shape

(36318, 49)

In [392]:
segment_two_train.head(1)

Unnamed: 0,campaign_id,total_links,no_of_internal_links,no_of_images,no_of_sections,id,user_id,is_open,is_click,day_of_week,hour_of_day,000,2017,50,ahead,allen,anand,and,artificial,at,attend,av,before,bhu,big,bird,booz,borne,boss,business,by,can,ceo,chance,chat,chief,churn,click,codefest,coming,competitions,conference,convince,cross,data,datahack,day,days,delhi,dhs2017,dj,dr,early,emerging,ends,exciting,expert,expires,fireside,former,go,gramener,grow,hackathon,hackathons,hamilton,iit,in,india,innovate,inr,intelligence,join,july,just,keynote,kirk,lacs,largest,last,learning,live,look,machine,meetups,month,more,much,ncr,new,newsletter,non,now,october,of,offer,on,participate,passes,patil,peek,prediction,prizes,programmers,reasons,register,roles,save,science,scientist,sell,september,should,sneak,spot,starts,summit,the,through,to,today,tonight,upto,us,visualizing,watch,way,webinar,webinars,why,win,with,world,worth,you,your
0,29,67,61,12,3,29_185580,185580,0,0,5,18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.284982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.218211,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.238297,0.0,0.0,0.0,0.0,0.369229,0.0,0.0,0.0,0.364326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.369229,0.0,0.0,0.0


In [393]:
segment_two_test.head(1)

Unnamed: 0,campaign_id,total_links,no_of_internal_links,no_of_images,no_of_sections,id,user_id,is_open,is_click,day_of_week,hour_of_day,2017,and,artificial,datahack,days,go,hackathons,in,india,intelligence,just,learning,machine,new,newsletter,now,register,summit,to
0,52,67,62,10,4,52_189338,189338,0,0,3,12,0.274385,0.274385,0.0,0.274385,0.0,0.0,0.274385,0.0,0.0,0.0,0.0,0.0,0.0,0.274385,0.274385,0.0,0.0,0.274385,0.0


In [384]:
to_drop = ['communication_type','email_body','subject','email_url']
segment_one_test.drop(to_drop, axis = 1, inplace = True)
segment_one_train.drop(to_drop, axis = 1, inplace = True)
segment_two_test.drop(to_drop, axis = 1, inplace = True)
segment_two_train.drop(to_drop, axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [394]:
a = list(set(segment_one_test.columns) - set(segment_one_train.columns))
segment_one_test.drop(a, axis = 1 , inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [395]:
a = list(set(segment_two_test.columns) - set(segment_two_train.columns))
segment_two_test.drop(a, axis = 1 , inplace = True)
a = list(set(segment_two_train.columns) - set(segment_two_test.columns))
segment_two_train.drop(a, axis = 1 , inplace = True)

In [396]:
segment_one_train.shape

(681722, 173)

In [397]:
segment_one_test.shape

(225216, 173)

In [398]:
segment_two_train.shape

(761657, 30)

In [399]:
segment_two_test.shape

(36318, 30)

In [400]:
segment_one_test.to_csv("segment_one_test.csv")
segment_one_train.to_csv("segment_one_train.csv")
segment_two_test.to_csv("segment_two_test.csv")
segment_two_train.to_csv("segment_two_train.csv")