In [2]:
# Dependencies for the whole notebook: data handling, regex, modelling, plotting.
import pandas as pd
import numpy as np
import re
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Suppress every warning category for the rest of the session. This also hides
# pandas diagnostics (for example chained-assignment warnings), so silent
# data-handling problems will not be reported.
warnings.filterwarnings('ignore')
import matplotlib
import matplotlib.font_manager as fm 
# Use the Microsoft YaHei typeface for all matplotlib text
# (presumably so Chinese labels render; confirm the font is installed).
matplotlib.rcParams['font.family'] = 'Microsoft Yahei'

In [13]:
# Load the customer table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='客户信息.csv')

In [14]:
# Display the frame exactly as loaded, before any cleaning.
df

Unnamed: 0,ID,出生日期,性别,婚姻状况,学历,用车区域,车辆用途,车辆价值,车辆类别
0,63581743,16-Mar-39,男,未婚,博士,城镇,个人,"$14,230",MPV
1,132761049,21-Jan-56,男,未婚,高中,城镇,单位,"$14,940",MPV
2,921317019,18-Nov-51,男,未婚,本科,城镇,个人,"$21,970",厢式货车
3,727598473,5-Mar-64,女,已婚,高中,城镇,个人,"$4,010",SUV
4,450221861,5-Jun-48,男,已婚,高中以下,城镇,个人,"$15,440",MPV
...,...,...,...,...,...,...,...,...,...
10297,67790126,13-Aug-54,男,已婚,博士,城镇,个人,"$13,270",MPV
10298,61970712,17-Jun-53,男,已婚,硕士,城镇,单位,"$24,490",卡车
10299,849208064,18-Jun-51,女,已婚,高中以下,城镇,个人,"$13,820",SUV
10300,627828331,12-Dec-48,女,已婚,本科,城镇,个人,"$22,550",MPV


In [15]:
# Strip every non-digit character from the vehicle value ("$14,230" -> "14230").
# Non-string entries (e.g. NaN) are passed through untouched.
# A single column assignment is used instead of a per-row `df[col][i] = ...`
# loop: chained assignment writes to a temporary under pandas Copy-on-Write
# and also assumes the index is exactly 0..n-1.
df['车辆价值'] = df['车辆价值'].map(
    lambda value: re.sub(r'\D', '', value) if isinstance(value, str) else value
)

In [16]:
# Cast the digit strings to integers and scale them by 10.
# NOTE(review): the factor 10 is unexplained here -- presumably a rough
# currency conversion; confirm and move it into a named constant.
df['车辆价值'] = df['车辆价值'].astype(int).mul(10)

In [17]:
# Bucket the numeric vehicle value into five ordered price bands.
# pd.cut uses right-closed intervals by default, i.e. (0, 100000], (100000, 200000], ...
value_bin_edges = [0, 100000, 200000, 300000, 400000, 1e10]
value_bin_labels = ['小于10万', '10-20万', '20-30万', '30-40万', '大于40万']
df['车辆价值'] = pd.cut(df['车辆价值'], value_bin_edges, labels=value_bin_labels)

In [18]:
# Parse birth dates such as "16-Mar-39" with an explicit day-month-year format.
# `astype("datetime64[D]")` relied on implicit string parsing and asks for a
# day-resolution unit that pandas 2.x rejects; pd.to_datetime with a format is
# deterministic and returns a regular datetime64 column.
# Two-digit years follow the strptime rule (00-68 -> 2000-2068, 69-99 -> 1969-1999),
# so most values still land in the 2000s after this step.
df['出生日期'] = pd.to_datetime(df['出生日期'], format='%d-%b-%y')

In [19]:
# Move birth dates that were parsed into the 2000s back by one century.
# Only dates with year >= 2000 are shifted: subtracting 100 years from every
# row would push dates already parsed as 19xx into the 1800s.
# NOTE(review): this assumes every birth date belongs to the 1900s, as the
# previous unconditional shift implied -- confirm against the data source.
# DateOffset clips 29 February to 28 February when the target year is not a
# leap year, where Timestamp.replace(year=...) would raise.
birth_dates = df['出生日期']
parsed_in_2000s = birth_dates.dt.year >= 2000
df['出生日期'] = birth_dates.where(
    ~parsed_in_2000s, birth_dates - pd.DateOffset(years=100)
)

In [20]:
# Display the frame after the vehicle-value and birth-date clean-up.
df

Unnamed: 0,ID,出生日期,性别,婚姻状况,学历,用车区域,车辆用途,车辆价值,车辆类别
0,63581743,1939-03-16,男,未婚,博士,城镇,个人,10-20万,MPV
1,132761049,1956-01-21,男,未婚,高中,城镇,单位,10-20万,MPV
2,921317019,1951-11-18,男,未婚,本科,城镇,个人,20-30万,厢式货车
3,727598473,1964-03-05,女,已婚,高中,城镇,个人,小于10万,SUV
4,450221861,1948-06-05,男,已婚,高中以下,城镇,个人,10-20万,MPV
...,...,...,...,...,...,...,...,...,...
10297,67790126,1954-08-13,男,已婚,博士,城镇,个人,10-20万,MPV
10298,61970712,1953-06-17,男,已婚,硕士,城镇,单位,20-30万,卡车
10299,849208064,1951-06-18,女,已婚,高中以下,城镇,个人,10-20万,SUV
10300,627828331,1948-12-12,女,已婚,本科,城镇,个人,20-30万,MPV


In [22]:
# Summarise column dtypes and non-null counts of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10302 entries, 0 to 10301
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ID      10302 non-null  int64         
 1   出生日期    10302 non-null  datetime64[ns]
 2   性别      10302 non-null  object        
 3   婚姻状况    10302 non-null  object        
 4   学历      10302 non-null  object        
 5   用车区域    10302 non-null  object        
 6   车辆用途    10302 non-null  object        
 7   车辆价值    10302 non-null  category      
 8   车辆类别    10302 non-null  object        
dtypes: category(1), datetime64[ns](1), int64(1), object(6)
memory usage: 654.3+ KB


In [21]:
# Print the distinct values of each column, one column per line of output,
# to check the categorical levels after cleaning.
for column_name in df.columns:
    distinct_values = df[column_name].unique()
    print(distinct_values)

[ 63581743 132761049 921317019 ...  67790126 849208064 680381960]
['1939-03-16T00:00:00.000000000' '1956-01-21T00:00:00.000000000'
 '1951-11-18T00:00:00.000000000' ... '1954-08-13T00:00:00.000000000'
 '1951-06-18T00:00:00.000000000' '1947-02-27T00:00:00.000000000']
['男' '女']
['未婚' '已婚']
['博士' '高中' '本科' '高中以下' '硕士']
['城镇' '农村']
['个人' '单位']
['10-20万', '20-30万', '小于10万', '30-40万', '大于40万']
Categories (5, object): ['小于10万' < '10-20万' < '20-30万' < '30-40万' < '大于40万']
['MPV' '厢式货车' 'SUV' '跑车' '卡车' '轿车']


In [26]:
# Simulate per-customer driving behaviour (mileage band, speed band, day/night)
# and draw a claim frequency and a claim severity for each simulated row.
distanceband_categories = ['小于5千', '5千-1万', '1万-1万5', '1万5-2万', '大于2万']
speedband_categories = ['小于40', '40-59', '60-79', '80-99', '大于100']
timenight_categories = ['日间', '夜间']

# Number of simulated rows; the customer frame printed by df.info() has 10302 entries.
n_samples = 10302

# Fixed seed so that Restart & Run All regenerates exactly the same dataset.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

distanceband = rng.choice(distanceband_categories, size=n_samples, p=[0.15, 0.25, 0.3, 0.2, 0.1])
speedband = rng.choice(speedband_categories, size=n_samples, p=[0.3, 0.35, 0.2, 0.1, 0.05])
timenight = rng.choice(timenight_categories, size=n_samples, p=[0.65, 0.35])


def band_linear_predictor(coefs, distance, speed):
    """Return intercept + dummy-coded band effects for every sample.

    Parameters
    ----------
    coefs : sequence of 9 floats
        ``coefs[0]`` is the intercept, ``coefs[1:5]`` belong to the distance
        bands other than the first one ('小于5千'), and ``coefs[5:9]`` belong to
        the speed bands other than the last one ('大于100').
    distance, speed : numpy.ndarray of str
        Band label of each sample; both arrays have the same length.

    Returns
    -------
    numpy.ndarray of float
        One linear-predictor value per sample.

    Raises
    ------
    ValueError
        If the number of coefficients does not match the number of bands.
    """
    indicators = (
        [distance == level for level in distanceband_categories[1:]]
        + [speed == level for level in speedband_categories[:-1]]
    )
    if len(coefs) != 1 + len(indicators):
        raise ValueError(
            f"expected {1 + len(indicators)} coefficients, got {len(coefs)}"
        )
    eta = np.full(len(distance), coefs[0], dtype=float)
    # Accumulate in coefficient order: intercept, distance effects, speed effects.
    for beta, indicator in zip(coefs[1:], indicators):
        eta = eta + beta * indicator
    return eta


# Claim frequency: Poisson draw whose mean is exp(linear predictor).
# NOTE(review): coefficient values are presumably taken from previously
# fitted models -- confirm their origin.
coefficients = [-0.042463,
                0.277856,
                0.588736,
                0.906591,
                1.561939,
                -0.711360,
                -0.189678,
                0.094603,
                0.310905]

linear_predictor = band_linear_predictor(coefficients, distanceband, speedband)
expected_values = np.exp(linear_predictor)
freq = rng.poisson(expected_values)

# Claim severity: exponential draw whose mean is exp(linear predictor),
# rounded to whole units.
coefficients = [7.83214,
                0.46704,
                0.26546,
                0.33356,
                0.49927,
                -1.94865,
                -1.64837,
                -1.23472,
                -0.73830]

linear_predictor = band_linear_predictor(coefficients, distanceband, speedband)
expected_values = np.exp(linear_predictor)
# NOTE(review): a severity is drawn for every row, including rows whose
# frequency is 0 -- confirm that this is intended.
sever = np.round(rng.exponential(expected_values)).astype(int)

data = pd.DataFrame({
    '年平均里程': distanceband,
    '年平均车速': speedband,
    '主要行车时间': timenight,
    '出险频率': freq,
    '出险强度': sever
})

In [27]:
# Display the simulated driving-behaviour and claims frame.
data

Unnamed: 0,年平均里程,年平均车速,主要行车时间,出险频率,出险强度
0,5千-1万,60-79,日间,1,848
1,小于5千,60-79,日间,0,182
2,1万-1万5,60-79,夜间,0,1915
3,1万5-2万,80-99,夜间,5,681
4,小于5千,小于40,日间,0,63
...,...,...,...,...,...
10297,1万5-2万,40-59,日间,3,81
10298,1万-1万5,60-79,日间,3,537
10299,1万-1万5,小于40,日间,0,162
10300,1万-1万5,80-99,夜间,1,8776


In [28]:
# Join the customer attributes and the simulated driving features column-wise.
# pd.concat(axis=1) aligns rows on the index and fills mismatches with NaN,
# so fail loudly if the two frames do not share the same index.
if not df.index.equals(data.index):
    raise ValueError(
        "df and data must share the same index to be joined row-by-row "
        f"(got {len(df)} and {len(data)} rows)"
    )
frame = [df, data]
df1 = pd.concat(frame, axis=1)

In [29]:
# Display the combined customer + driving-behaviour frame.
df1

Unnamed: 0,ID,出生日期,性别,婚姻状况,学历,用车区域,车辆用途,车辆价值,车辆类别,年平均里程,年平均车速,主要行车时间,出险频率,出险强度
0,63581743,1939-03-16,男,未婚,博士,城镇,个人,10-20万,MPV,5千-1万,60-79,日间,1,848
1,132761049,1956-01-21,男,未婚,高中,城镇,单位,10-20万,MPV,小于5千,60-79,日间,0,182
2,921317019,1951-11-18,男,未婚,本科,城镇,个人,20-30万,厢式货车,1万-1万5,60-79,夜间,0,1915
3,727598473,1964-03-05,女,已婚,高中,城镇,个人,小于10万,SUV,1万5-2万,80-99,夜间,5,681
4,450221861,1948-06-05,男,已婚,高中以下,城镇,个人,10-20万,MPV,小于5千,小于40,日间,0,63
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10297,67790126,1954-08-13,男,已婚,博士,城镇,个人,10-20万,MPV,1万5-2万,40-59,日间,3,81
10298,61970712,1953-06-17,男,已婚,硕士,城镇,单位,20-30万,卡车,1万-1万5,60-79,日间,3,537
10299,849208064,1951-06-18,女,已婚,高中以下,城镇,个人,10-20万,SUV,1万-1万5,小于40,日间,0,162
10300,627828331,1948-12-12,女,已婚,本科,城镇,个人,20-30万,MPV,1万-1万5,80-99,夜间,1,8776


In [30]:
# Persist the combined frame as UTF-8 CSV without the index column.
# NOTE(review): this is the same file name the notebook reads at the start, so
# the raw input is overwritten and a second Run All starts from the already
# transformed data -- consider writing to a separate output file.
df1.to_csv(path_or_buf="客户信息.csv", encoding='utf-8', index=False)