In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler
import math

In [None]:
import warnings
# Install an "ignore" filter with no category or module restriction, so every
# warning raised by the cells below is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings from
# pandas/NumPy — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Load the pickled variable tables for the online and offline channels.
# The *_basic and *_time tables (plus `online`) are merged on 'clnt_id' below
# to form the factor-analysis inputs; the *_group tables are joined to the
# factor scores afterwards.
# NOTE(review): pd.read_pickle executes whatever the pickle contains — only
# load these files if they come from a trusted source.
online_basic = pd.read_pickle("../new_data/variables/online_basic.pkl")
online_time = pd.read_pickle("../new_data/variables/online_time.pkl")
online = pd.read_pickle("../new_data/variables/online.pkl")
online_group = pd.read_pickle("../new_data/variables/online_group.pkl")
offline_basic = pd.read_pickle("../new_data/variables/offline_basic.pkl")
offline_time = pd.read_pickle("../new_data/variables/offline_time.pkl")
offline_group = pd.read_pickle("../new_data/variables/offline_group.pkl")

In [None]:
def scaling(df):
    """Standardize the columns of ``df`` with scikit-learn's ``StandardScaler``.

    The scaler is fitted on ``df`` and then applied to that same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are to be standardized.

    Returns
    -------
    pandas.DataFrame
        The transformed values, labeled with the index and columns of ``df``.
    """
    standardized = StandardScaler().fit(df).transform(df)
    return pd.DataFrame(standardized, index=df.index, columns=df.columns)

def covariance(df):
    """Return the covariance matrix of the columns of ``df`` as a labeled frame.

    Rows of ``df`` are treated as observations and columns as variables
    (``rowvar=0``). Both axes of the result are labeled with ``df.columns``.
    """
    labels = df.columns
    return pd.DataFrame(np.cov(df, rowvar=0), index=labels, columns=labels)

def factor_analysis(df_cov):
    """Run a full singular value decomposition of ``df_cov``.

    Returns
    -------
    tuple
        ``(U, S, V)`` in the order produced by ``np.linalg.svd``. ``U`` and
        ``V`` are wrapped in DataFrames; ``S`` is returned unchanged. ``V`` is
        the third output of ``np.linalg.svd`` exactly as returned (NumPy
        documents it as the already-transposed right singular vectors).
    """
    left, singular_values, right = np.linalg.svd(df_cov, full_matrices=True)
    return (pd.DataFrame(left), singular_values, pd.DataFrame(right))

def factor_score(df,df_cov,loading,data):
    """Compute weighted least-squares factor scores for every row of ``df``.

    With ``L = loading`` and ``E = df_cov - L @ L.T`` the weight matrix is
    ``W = inv(L.T @ inv(E) @ L) @ L.T @ inv(E)``; each row ``x`` of ``df`` is
    mapped to ``W @ x``.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations (rows) by variables (columns). Columns are used
        positionally, so they must be ordered like the rows of ``loading``.
    df_cov : pandas.DataFrame
        Covariance matrix of the variables; it is label-aligned with
        ``loading @ loading.T`` when the residual is formed.
    loading : pandas.DataFrame
        Variables-by-factors loading matrix.
    data : str
        Prefix of the output column names (``data + '1'``, ``data + '2'``, ...).

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df`` (same index), one column per factor.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``E`` or ``L.T @ inv(E) @ L`` is exactly singular.
    """
    load = np.asarray(loading)
    # NOTE(review): E is the full residual matrix, not only its diagonal. When
    # `loading` is built from the leading singular vectors of `df_cov` (as the
    # calling cells in this notebook do), E is rank-deficient in exact
    # arithmetic and its inverse is ill-conditioned. The usual weighted
    # least-squares (Bartlett) score uses only the diagonal — confirm intent.
    residual = np.asarray(df_cov - loading @ loading.T)
    residual_inv = np.linalg.inv(residual)  # inverted once and reused
    weighted = load.T @ residual_inv
    weights = np.linalg.inv(weighted @ load) @ weighted
    # One matrix product replaces the per-row loop over df.loc[i]; it is faster
    # and also correct when index labels repeat (df.loc[i] would then return a
    # 2-D frame instead of a single row).
    values = np.asarray(df) @ weights.T
    columns = [data + str(k + 1) for k in range(values.shape[1])]
    return pd.DataFrame(values, index=df.index, columns=columns)

# Online Data

In [None]:
# Assemble the online feature table in a single expression:
#   1. outer-join the basic and time tables on clnt_id,
#   2. left-join the `online` table onto that result,
#   3. replace the NaNs created by the joins with 0,
#   4. standardize every column.
online_var = scaling(
    pd.merge(
        pd.merge(online_basic, online_time, on='clnt_id', how='outer'),
        online,
        on='clnt_id',
        how='left',
    ).fillna(0)
)

In [None]:
# Covariance matrix of the standardized online variables, followed by its SVD.
# U, S and V are reassigned by the offline section further down, so the online
# cells that read them must run before that section.
online_cov = covariance(online_var)
U,S,V = factor_analysis(online_cov)

In [None]:
# Scree plot of the singular values of the online covariance matrix, labeled so
# the figure can be read on its own.
plt.plot(S)
plt.xlabel("Component (0-based position)")
plt.ylabel("Singular value")
plt.title("Online variables: singular values of the covariance matrix");

In [None]:
# Share of the sum of all singular values carried by each of the first five.
S[0:5]/S.sum()

In [None]:
# Loadings of the five retained online factors: the columns of U selected by
# the label slice 0:4 (inclusive with .loc), each scaled by the square root of
# the matching singular value. Rows are labeled with the variable names.
loading = pd.DataFrame(
    np.dot(U.loc[:, 0:4], np.diag([sv ** 0.5 for sv in S[0:5]])),
    index=online_var.columns,
)
loading.columns = ['online' + str(k + 1) for k in range(loading.shape[1])]
round(loading, 2)

In [None]:
# Communality of each variable: the sum of its squared loadings over all
# retained factors. Summing across the columns, rather than naming
# online1..online5 one by one, keeps this correct if the number of retained
# factors changes.
com = (loading ** 2).sum(axis=1)

In [None]:
# Display `com` (sum of squared loadings per variable) rounded to two decimals
# as a one-column table.
pd.DataFrame(round(com,2))

In [None]:
# Residual covariance: the part of the online covariance matrix that the
# retained loadings do not reproduce (cov - L @ L.T), shown as a titled heatmap.
error = pd.DataFrame(online_cov - loading @ loading.T)
plt.figure(figsize=(10,10))
sns.heatmap(data = error, linewidths=.5, cmap='Blues').set_title(
    "Online: residual covariance (cov - L L^T)"
);

In [None]:
# Factor scores for every row of online_var; the output columns are named
# 'online1', 'online2', ... (one per column of `loading`).
online_score = factor_score(online_var,online_cov,loading,'online')

In [None]:
# Applying argsort twice along the last axis gives each score's rank within its
# row (0 = smallest); with five score columns, rank 4 marks the row's largest.
# The sum then counts those marks — per factor column, assuming the argsort
# result is still a DataFrame (TODO confirm for the installed pandas/NumPy).
(np.argsort(np.argsort(online_score))==4).sum()

In [None]:
# Persist the online factor scores to disk as a pickle.
online_score.to_pickle("../new_data/factor_score/online_score.pkl")

In [None]:
# Relabel the columns of online_group positionally as group1..group4; the
# assignment requires the frame to have exactly four columns.
online_group.columns = ['group1','group2','group3','group4']

In [None]:
# Outer-join the online factor scores with the group columns on clnt_id, so
# keys present in either table are kept.
online_clac = pd.merge(online_score,online_group,on='clnt_id',how='outer')

In [None]:
# Column totals of the four group columns, then each group's share of the
# grand total (displayed as the cell output).
a = online_clac[['group1','group2','group3','group4']].sum()
a / a.sum()

In [None]:
# Pairwise correlations among the online factor scores and the group columns,
# annotated with the coefficient values and titled so the figure stands alone.
plt.figure(figsize=(10,10))
sns.heatmap(data = online_clac.corr(), linewidths=.5, annot=True, cmap='Blues').set_title(
    "Online: correlation of factor scores and group columns"
);

# Offline Data

In [None]:
# Assemble the offline feature table from the basic and time tables.
offline_var = pd.merge(offline_basic, offline_time, on='clnt_id', how='outer')
# The outer join leaves NaN for keys that appear in only one table. Fill them
# with 0, as the online pipeline does, so NaN cannot propagate into np.cov and
# the SVD. This is a no-op when the join produced no NaN.
offline_var = offline_var.fillna(0)
offline_var = scaling(offline_var)
# Covariance of the standardized offline variables and its SVD; this
# overwrites the U, S, V computed for the online data.
offline_cov = covariance(offline_var)
U,S,V = factor_analysis(offline_cov)

In [None]:
# Scree plot of the singular values of the offline covariance matrix, labeled
# so the figure can be read on its own.
plt.plot(S)
plt.xlabel("Component (0-based position)")
plt.ylabel("Singular value")
plt.title("Offline variables: singular values of the covariance matrix");

In [None]:
# Share of the sum of all singular values carried by each of the first three.
S[0:3]/S.sum()

In [None]:
# Loadings of the three retained offline factors: the columns of U selected by
# the label slice 0:2 (inclusive with .loc), each scaled by the square root of
# the matching singular value. Rows are labeled with the variable names.
loading = pd.DataFrame(
    np.dot(U.loc[:, 0:2], np.diag([sv ** 0.5 for sv in S[0:3]])),
    index=offline_var.columns,
)
loading.columns = ['offline' + str(k + 1) for k in range(loading.shape[1])]
round(loading, 2)

In [None]:
# Communality of each variable: the sum of its squared loadings over all
# retained factors. Summing across the columns, rather than naming
# offline1..offline3 one by one, keeps this correct if the number of retained
# factors changes.
com = (loading ** 2).sum(axis=1)

In [None]:
# Display `com` (sum of squared loadings per variable) rounded to two decimals
# as a one-column table.
pd.DataFrame(round(com,2))

In [None]:
# Residual covariance: the part of the offline covariance matrix that the
# retained loadings do not reproduce (cov - L @ L.T), shown as a titled heatmap.
error = pd.DataFrame(offline_cov - loading @ loading.T)
plt.figure(figsize=(7,7))
sns.heatmap(data = error, linewidths=.5, cmap='Blues').set_title(
    "Offline: residual covariance (cov - L L^T)"
);

In [None]:
# Factor scores for every row of offline_var; the output columns are named
# 'offline1', 'offline2', ... (one per column of `loading`).
offline_score = factor_score(offline_var,offline_cov,loading,'offline')

In [None]:
# Preview the first 20 rows of the offline factor scores.
offline_score.head(20)

In [None]:
# Applying argsort twice along the last axis gives each score's rank within its
# row (0 = smallest); with three score columns, rank 2 marks the row's largest.
# The sum then counts those marks — per factor column, assuming the argsort
# result is still a DataFrame (TODO confirm for the installed pandas/NumPy).
(np.argsort(np.argsort(offline_score))==2).sum()

In [None]:
# Persist the offline factor scores to disk as a pickle.
offline_score.to_pickle("../new_data/factor_score/offline_score.pkl")

In [None]:
# Relabel the columns of offline_group positionally as group1..group4; the
# assignment requires the frame to have exactly four columns.
offline_group.columns = ['group1','group2','group3','group4']

In [None]:
# Outer-join the offline factor scores with the group columns on clnt_id, so
# keys present in either table are kept.
offline_clac = pd.merge(offline_score,offline_group,on='clnt_id',how='outer')

In [None]:
# Column totals of the four group columns, then each group's share of the
# grand total (displayed as the cell output). Reuses the name `a` from the
# online section.
a = offline_clac[['group1','group2','group3','group4']].sum()
a / a.sum()

In [None]:
# Pairwise correlations among the offline factor scores and the group columns,
# annotated with the coefficient values and titled so the figure stands alone.
plt.figure(figsize=(10,10))
sns.heatmap(data = offline_clac.corr(), linewidths=.5, annot=True, cmap='Blues').set_title(
    "Offline: correlation of factor scores and group columns"
);

In [None]:
# Outer-join the online and offline factor scores on clnt_id; a key missing
# from one side gets NaN in that side's score columns.
score = pd.merge(online_score,offline_score,on='clnt_id',how='outer')

In [None]:
# Preview the first 20 rows of the combined online/offline score table.
score.head(20)

In [None]:
# Pairwise correlations between the online and offline factor scores,
# annotated with the coefficient values and titled so the figure stands alone.
plt.figure(figsize=(10,10))
sns.heatmap(data = score.corr(), annot=True, linewidths=.5, cmap='Blues').set_title(
    "Correlation between online and offline factor scores"
);