In [1]:
'''
Script to perform feature engineering on the data set that we pre-processed using Spark
author: Yue Wen
'''
# import necessary library
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

# read the input table and discard the "Unnamed: 0" column in the same step
data = pd.read_csv("data.csv").drop("Unnamed: 0", axis=1)

In [24]:
# build label

# last_n_cnt: count of songs listened to during the last 8 days (per the original note).
# Rows with a positive count get churn = 0; every other row (zero or missing) gets churn = 1.
listened_recently = data["last_n_cnt"] > 0
data["churn"] = np.where(listened_recently, 0, 1)
# the label is derived from this column, so it is removed from the table
del data["last_n_cnt"]
data["churn"].describe()

count    610629.000000
mean          0.675718
std           0.468106
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           1.000000
Name: churn, dtype: float64

In [33]:
# song type entropy: Calculate song type entropy

def entropy(p0,p1,p2):
    """
    Entropy (natural logarithm) of a distribution over three outcomes.

    Only strictly positive inputs contribute to the sum; zeros, negatives
    and NaN fail the ``> 0.0`` test and are skipped, so log(0) is never
    evaluated.

    Params
    ------
    p0, p1, p2: float
        probability of each of the three outcomes

    Returns
    -------
    float
        -sum(p * log(p)) over the positive inputs, or 0 when no input is positive
    """
    weighted_log_sum = 0
    for prob in (p0, p1, p2):
        if prob > 0.0:
            weighted_log_sum += np.log(prob) * prob
    return -weighted_log_sum

# share of songs per type: the three type counts are summed once and reused as the denominator
type_cnt_total = data["type0_cnt"] + data["type1_cnt"] + data["type2_cnt"]
type_columns = (
    ("type0_cnt", "type0_perc"),
    ("type1_cnt", "type1_perc"),
    ("type2_cnt", "type2_perc"),
)
for cnt_col, perc_col in type_columns:
    data[perc_col] = data[cnt_col] / type_cnt_total
# the raw counts are dropped once the percentages exist
for cnt_col, perc_col in type_columns:
    del data[cnt_col]

# entropy of the three percentages, evaluated one row at a time
data["song_type_entropy"] = data.apply(
    lambda row: entropy(row["type0_perc"], row["type1_perc"], row["type2_perc"]),
    axis=1,
)

In [40]:
# build other features based on the per-window counts:
# flag rows whose p3 count is zero, and rows where both the p3 and p2 counts are zero
p3_is_zero = data["p3_cnt"] == 0
p2_is_zero = data["p2_cnt"] == 0
data["period_3_no_behavior"] = np.where(p3_is_zero, 1, 0)
data["period_23_no_behavior"] = np.where(p3_is_zero & p2_is_zero, 1, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [42]:
def change_perc(prev,cur):
    '''
    Relative change from a previous value to a current value.

    Params
    ------
    prev: float
        value to change from
    cur: float
        value to change to

    Returns
    -------
    float
        0 when both values are zero, 1 when prev is zero and cur is
        positive, otherwise (cur - prev) / prev
    '''
    if prev == 0:
        # a zero baseline cannot be divided by: these two cases get fixed values
        if cur == 0:
            return 0
        if cur > 0:
            return 1
    return (cur - prev) / prev


# relative change between consecutive period counts, evaluated one row at a time
data["2to3_change_perc"] = data.apply(
    lambda row: change_perc(row["p2_cnt"], row["p3_cnt"]),
    axis=1,
)
data["1to2_change_perc"] = data.apply(
    lambda row: change_perc(row["p1_cnt"], row["p2_cnt"]),
    axis=1,
)

# encode categorical type: 1 where device equals ' ar', 0 for every other value
# NOTE(review): the literal starts with a space - confirm it matches the raw values in data.csv
# NOTE(review): the column is overwritten in place, so re-running this cell on the
# already-encoded integers would compare them with ' ar' and produce all zeros
device_is_ar = data["device"] == ' ar'
data["device"] = np.where(device_is_ar, 1, 0)

In [50]:
# make churn the last column: pop removes it and returns it, re-assigning appends it at the end
churn = data.pop("churn")
data["churn"] = churn
# write the finished table to disk (to_csv keeps its default of writing the index too)
data.to_csv("final_data.csv")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
