In [1]:

import seaborn as sns
import csv
import sqlite3
import math
import matplotlib.pyplot as plt
import pickle as pkl
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from datetime import datetime
from sklearn.metrics import silhouette_score, silhouette_samples
import matplotlib.cm as cm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import bw_class
import sy_class
import sy_database

In [2]:
# Load the three dataset splits in a fixed order: train, validation, test.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("./train.csv", "./val.csv", "./test.csv")
)

In [3]:
# Build the DataFrame with the additional derived variables.
def process_and_merge_data(df_name, bw_class):
    """Preprocess a DataFrame and attach the '고객분류' label from clustering.

    Parameters
    ----------
    df_name : pandas.DataFrame
        The input data itself (despite the parameter name). It is handed to
        both ``bw_class.bw_preprocessing`` and ``bw_class.RFMProcessor``, and
        again to ``RFMProcessor.predict``.
    bw_class : module or namespace
        Object exposing ``bw_preprocessing``, ``RFMProcessor`` and
        ``mapping_cluster``. Their internals are not visible in this notebook.

    Returns
    -------
    The frame returned by the preprocessor, left-joined on '고객ID' with the
    ['고객ID', '고객분류'] columns of the mapped cluster data.
    """
    # Stage 1 - preprocessing via bw_class.bw_preprocessing.
    preprocessor = bw_class.bw_preprocessing(df_name)
    preprocessor.apply_my_function()
    preprocessed = preprocessor.return_dataframe()

    # Stage 2 - RFM processing and clustering. process_data() yields four
    # values; only the last one is needed to fit the clustering here.
    rfm = bw_class.RFMProcessor(df_name)
    _, _, _, scaled_features = rfm.process_data()
    rfm.fit_clustering(scaled_features, n_clusters=4)
    cluster_predictions = rfm.predict(df_name)

    # Stage 3 - keep only the join key and the label from the mapped clusters.
    segment_columns = ['고객ID', '고객분류']
    customer_segments = bw_class.mapping_cluster(cluster_predictions)[segment_columns]

    # Left join so every preprocessed row is kept even without a matching label.
    return preprocessed.merge(customer_segments, on='고객ID', how='left')


In [4]:
# Run the same preprocessing + clustering pipeline on each split, in order.
train, val, test = (
    process_and_merge_data(split_df, bw_class)
    for split_df in (train_df, val_df, test_df)
)

In [5]:
# Save the preprocessed DataFrames to the database
db_manager = sy_database.DatabaseManager()

# Create new database
db_manager.create_database('./sy_db.db')

# Create the tables in sy_db, one per split.
# NOTE: per the recorded output of this cell, create_new_table drops an
# existing table with the same name before creating the new one.
db_manager.create_new_table(train, 'train')
db_manager.create_new_table(val, 'val')
db_manager.create_new_table(test, 'test')

# # Build the train DataFrame (from the database)
# train = db_manager.making_dataframe("train")

Database created at: ./sy_db.db
Database path set to: ./sy_db.db


Existing table 'train' dropped, and new table is created


Existing table 'val' dropped, and new table is created


Existing table 'test' dropped, and new table is created


In [6]:
# Cohort analysis
cohort_analysis = sy_class.CohortAnalysis(train)
cohort = cohort_analysis.calculate_cohort()
# retention_matrix is computed from the cohort table; it is not written to the database in this cell.
retention_matrix = cohort_analysis.calculate_retention_rate(cohort)

# Save the cohort df to sy_db
db_manager.create_new_table(cohort, 'cohort')




Existing table 'cohort' dropped, and new table is created


In [7]:
# Repurchase graph by category
# NOTE: CategoryAnalysis is an instance (not a class) despite its CamelCase name;
# the visualization cells that follow call methods on this same object.
CategoryAnalysis = sy_class.CustomerCategoryAnalysis(train)
CategoryAnalysis.calculate_repurchase_periods()
category = CategoryAnalysis.create_category_dataframe()

# Save the category df to sy_db
db_manager.create_new_table(category, 'category')




Existing table 'category' dropped, and new table is created


In [8]:
# Visualize the '평균 재구매 주기(일)' (average repurchase cycle, in days) metric using the category table.
CategoryAnalysis.visualize_repurchase_periods('평균 재구매 주기(일)', category)

In [9]:
# Visualize the '재구매율' (repurchase rate) metric using the category table.
CategoryAnalysis.visualize_repurchase_periods('재구매율', category)

### 재구매 여부 예측

In [10]:
# Save to the db a `repurchase` df made up of the columns used for the analysis
columns = ['제품카테고리', '평균금액', '배송료', '쿠폰상태', '성별', '고객지역', '가입기간',  '할인율', '마케팅비용', '고객소비액', '매출', '재방문여부', '고객분류']
# Column subset of the processed train frame; raises KeyError if any listed column is missing.
repurchase = train[columns]
db_manager.create_new_table(repurchase, 'repurchase')



Existing table 'repurchase' dropped, and new table is created


In [11]:
# Logistic regression
# NOTE(review): the recorded run of this cell emitted "lbfgs failed to converge"
# (iteration limit reached). The fix (raising max_iter and/or scaling features)
# presumably belongs inside sy_class.RebuyPredictionModel - confirm there.
model_instance = sy_class.RebuyPredictionModel(train, target_column='재방문여부')
# NOTE(review): test_data comes from this model's own split (presumably a hold-out
# of `train`); it is a different object from the `test` frame built from test.csv.
train_data, test_data = model_instance.train_test_split()
# Write the hold-out rows to the 'predict' table before any predictions exist.
db_manager.create_new_table(test_data, "predict")

model_pipeline = model_instance.run_pipeline(train_data)
model_instance.save_model_and_transformation()

# Per the recorded output, evaluate_model prints the accuracy and confusion matrix for test_data.
model_instance.evaluate_model(test_data)
feature_names = model_instance.get_feature_names()
pred = model_instance.inference(test_data) 
# Per the recorded output, predictions are saved to the 'predict' table in sy_db.db.
model_instance.save_predictions_to_db(test_data, pred)



Existing table 'predict' dropped, and new table is created


Accuracy for test set: 0.7972861533848115
Confusion matrix for test set:
Predicted     0     1
Actual               
0          1587  1835
1           869  9048
Predictions saved to predict table in sy_db.db



lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



In [12]:
# Display the object returned by model_instance.run_pipeline(train_data) as the cell output.
model_pipeline