# 여행 상품 신청 여부 -> 지도학습모델을 사용해서 분류 및 예측을 진행할 것 

# ML 01 - Logistic Regression 
 - 분류 문제를 해결할 수 있도록 선형회귀 모델에 로지스틱 함수를 추가

# 0 데이터분석에 사용할 라이브러리 로드 

In [1]:
# Silence every warning so the pink warning boxes do not clutter the notebook output
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 데이터 분석에 사용할 라이브러리 로드 
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

## cf) 폰트 설정

In [3]:
def get_font_family():
    """
    Return the default Hangul-capable font name for the current OS.

    Returns "AppleGothic" on macOS and "Malgun Gothic" on Windows. On any
    other system (e.g. Linux / Colab) it tries to install the ``fonts-nanum``
    package with ``apt-get``, refreshes the fontconfig cache, registers the
    Nanum font files with matplotlib, and returns "NanumBarunGothic".

    The installation step is best-effort: a missing or failing ``apt-get`` /
    ``fc-cache`` does not raise.
    """
    import platform

    system_name = platform.system()
    if system_name == "Darwin":
        return "AppleGothic"
    if system_name == "Windows":
        return "Malgun Gothic"

    # Linux (Colab)
    import os
    import subprocess

    # Plain subprocess calls instead of IPython "!" magics, so the function
    # is valid Python outside a notebook as well. apt-get output is discarded,
    # as the original "> /dev/null" redirect did.
    commands = (
        (["apt-get", "install", "fonts-nanum", "-qq"], subprocess.DEVNULL),
        (["fc-cache", "-fv"], None),
    )
    for argv, stdout in commands:
        try:
            subprocess.run(argv, stdout=stdout, check=False)
        except OSError:
            # The tool is not available on this system; keep going so the
            # caller still gets a font name back.
            continue

    # Register the Nanum fonts through matplotlib's public API. The private
    # font_manager._rebuild() helper is not present in current matplotlib
    # releases, so it cannot be relied on to pick up new fonts.
    from matplotlib import font_manager

    for font_path in font_manager.findSystemFonts():
        if "nanum" in os.path.basename(font_path).lower():
            font_manager.fontManager.addfont(font_path)

    return "NanumBarunGothic"

# Use the platform's Hangul-capable font for all text, and draw the minus
# sign as an ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams["font.family"] = get_font_family()
plt.rcParams["axes.unicode_minus"] = False

# 1 데이터 셋 로드 

In [4]:
import glob

# glob.glob() returns matches in arbitrary order; sort them so the listing
# (and any positional lookup such as path[0]) is reproducible across systems.
path = sorted(glob.glob('data/*.csv'))
path

['data\\sample_submission.csv',
 'data\\test.csv',
 'data\\test_v1.csv',
 'data\\train.csv',
 'data\\train_v1.csv']

## 1.1 학습데이터 로드

In [62]:
# Preview the training CSV (the frame is only displayed, not assigned to a variable)
pd.read_csv('data/train_v1.csv', encoding = 'utf-8')

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome,ProdTaken
0,1,28.0,0,1,10.0,3,2,3,4.0,0,3.0,1,3.0,0,1,0,1.0,1,20384.0,0
1,2,34.0,1,3,14.0,3,1,2,4.0,1,4.0,2,1.0,1,5,1,0.0,2,19599.0,1
2,3,45.0,0,1,14.0,2,2,2,3.0,1,4.0,1,2.0,0,4,1,0.0,2,22295.0,0
3,4,29.0,0,1,7.0,3,2,3,5.0,0,4.0,1,3.0,0,4,0,1.0,1,21274.0,1
4,5,42.0,1,3,6.0,2,2,2,3.0,1,3.0,0,2.0,0,3,1,0.0,2,19907.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1950,1951,28.0,1,1,10.0,3,2,3,5.0,0,3.0,2,2.0,0,1,1,2.0,1,20723.0,0
1951,1952,41.0,1,3,8.0,2,1,3,3.0,4,5.0,0,1.0,0,5,1,1.0,0,31595.0,0
1952,1953,38.0,0,3,28.0,3,1,3,4.0,0,3.0,0,7.0,0,2,1,2.0,1,21651.0,0
1953,1954,28.0,1,3,30.0,3,1,3,5.0,1,3.0,1,3.0,0,1,1,2.0,2,22218.0,0


## 1.2 예측데이터 로드

In [63]:
# Preview the test CSV (the frame is only displayed, not assigned to a variable)
pd.read_csv('data/test_v1.csv', encoding = 'utf-8')

Unnamed: 0,id,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,1,32.0,0,3,13.0,3,2,2,5.000000,1,3.0,1,1.0,0,2,0,1.0,2,19668.0
1,2,46.0,1,2,11.0,3,2,3,3.701827,1,4.0,1,1.0,1,5,0,1.0,2,20021.0
2,3,37.0,1,3,22.0,3,2,3,4.000000,1,3.0,1,5.0,0,5,1,0.0,2,21334.0
3,4,43.0,1,1,36.0,3,2,3,6.000000,1,3.0,3,6.0,0,3,1,2.0,2,22950.0
4,5,25.0,1,3,7.0,1,1,4,4.000000,0,4.0,3,3.0,1,4,1,3.0,1,21880.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2928,2929,54.0,1,1,6.0,3,1,2,3.000000,4,3.0,2,7.0,0,4,1,1.0,0,32328.0
2929,2930,33.0,1,1,9.0,3,0,4,2.000000,1,3.0,3,2.0,0,3,0,1.0,2,23733.0
2930,2931,33.0,0,1,31.0,2,2,4,4.000000,1,3.0,0,3.0,0,4,1,1.0,2,23987.0
2931,2932,26.0,1,1,9.0,3,2,4,2.000000,0,5.0,3,2.0,0,2,1,3.0,1,22102.0


# 2 머신러닝 - 로지스틱 회귀분석

In [7]:
#로지스틱 회귀분석
from sklearn.linear_model import LogisticRegression

In [8]:
# Declare the model; no arguments are passed, so scikit-learn's defaults apply
model = LogisticRegression()

In [9]:
# Load both datasets in this cell so it does not depend on variables left over
# from an earlier session, and drop the `id` feature, which carries no
# analytical meaning. Reading from disk each time also keeps the cell safe to
# re-run.
train = pd.read_csv('data/train_v1.csv', encoding='utf-8').drop(columns=['id'])
test = pd.read_csv('data/test_v1.csv', encoding='utf-8').drop(columns=['id'])

In [66]:
# Separate the features used for training from the target we want to predict.
x_train = train.drop(columns=['ProdTaken'])
# Select the target as a 1-D Series: scikit-learn expects y with shape
# (n_samples,), and a single-column DataFrame triggers a DataConversionWarning.
y_train = train['ProdTaken']

In [67]:
# Train the model on the training features and target
model.fit(x_train,y_train)

LogisticRegression()

In [68]:
# Predict a label for every row of the test set
prediction = model.predict(test)

In [69]:
# Peek at the first 10 predictions
print(prediction[:10])

[0 0 0 0 0 0 0 0 0 0]


In [70]:
# Display the CSV paths found under data/ again
path

['data\\sample_submission.csv',
 'data\\test.csv',
 'data\\test_v1.csv',
 'data\\train.csv',
 'data\\train_v1.csv']

In [71]:
# Read the submission template by name rather than by its position in the
# glob result, whose ordering is not guaranteed.
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission

Unnamed: 0,id,ProdTaken
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
2928,2929,0
2929,2930,0
2930,2931,0
2931,2932,0


In [72]:
# Write the predicted values into the ProdTaken column of the submission frame
# NOTE(review): the assignment is positional - it assumes the template rows are
# in the same order as the test rows used for prediction; confirm.
sample_submission['ProdTaken'] = prediction

In [74]:
# Inspect the submission dataframe
sample_submission

Unnamed: 0,id,ProdTaken
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
2928,2929,0
2929,2930,0
2930,2931,0
2931,2932,0


In [75]:
# Check which distinct labels appear in the submission column
sample_submission['ProdTaken'].unique()

array([0, 1], dtype=int64)

In [76]:
# Save the submission to submission.csv without writing the DataFrame index
sample_submission.to_csv('submission.csv',index = False)