## データ型の確認をする

In [1]:
import pandas as pd
import numpy as np

# Load the outlier-clipped CSV. NOTE: the variable is named `train_data`, but the
# path points into the `test/` directory.
# MaritalStatus, CarCount and ChildCount are read with pandas' nullable Int64 dtype.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_clipped_outliers_ChildCount.csv'
train_data = pd.read_csv(data_path, low_memory=False, dtype={'MaritalStatus': 'Int64', 'CarCount': 'Int64', 'ChildCount': 'Int64'})

# Print the dtype of every column.
print("Data types of each column:\n", train_data.dtypes)

# Collect the columns matched by the 'int64' / 'float64' dtype filter and print them.
numerical_cols = train_data.select_dtypes(include=['int64', 'float64']).columns
print("Numerical columns:\n", numerical_cols)

Data types of each column:
 id                         int64
Age                        int64
TypeofContact             object
CityTier                   int64
DurationOfPitch            int64
Occupation                object
Gender                    object
NumberOfPersonVisiting     int64
NumberOfFollowups          int64
ProductPitched            object
PreferredPropertyStar      int64
NumberOfTrips              int64
Passport                   int64
PitchSatisfactionScore     int64
Designation               object
MonthlyIncome              int64
MaritalStatus              Int64
CarCount                   Int64
ChildCount                 Int64
dtype: object
Numerical columns:
 Index(['id', 'Age', 'CityTier', 'DurationOfPitch', 'NumberOfPersonVisiting',
       'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips',
       'Passport', 'PitchSatisfactionScore', 'MonthlyIncome', 'MaritalStatus',
       'CarCount', 'ChildCount'],
      dtype='object')


## 目的変数を使用せず新しい特徴量を作成する

In [14]:
import pandas as pd
import numpy as np
import os

# Load the outlier-clipped test split (the path is under test/).
# MaritalStatus, CarCount and ChildCount are read with pandas' nullable Int64 dtype,
# so a missing value in those columns is represented as <NA>.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_clipped_outliers_ChildCount.csv'
test_data = pd.read_csv(
    data_path,
    low_memory=False,
    dtype={'MaritalStatus': 'Int64', 'CarCount': 'Int64', 'ChildCount': 'Int64'},
)

# 1. FamilySize = the customer (1) + ChildCount + 1 more when MaritalStatus == 1
#    (1 means "married" according to the original note, i.e. the spouse is counted).
# The comparison uses the vectorised, NA-aware `eq` instead of a Python lambda:
# `1 if x == 1 else 0` raises "boolean value of NA is ambiguous" as soon as
# MaritalStatus contains <NA>. A missing MaritalStatus is counted as "no spouse".
has_spouse = test_data['MaritalStatus'].eq(1).fillna(False).astype('Int64')
test_data['FamilySize'] = 1 + test_data['ChildCount'] + has_spouse

# 2. Interaction feature: Age multiplied by MonthlyIncome.
age_values = test_data['Age']
income_values = test_data['MonthlyIncome']
test_data['Age_MonthlyIncome_Interaction'] = age_values * income_values

# 3. Bucket MonthlyIncome and Age into labelled intervals (one label per interval
#    between consecutive edges; the last interval is open-ended via np.inf).
income_edges = [0, 250000, 300000, 350000, 400000, 450000, np.inf]
income_labels = ['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh', 'UltraHigh']
test_data['Income_Bucket'] = pd.cut(income_values, bins=income_edges, labels=income_labels)

age_edges = [0, 20, 30, 40, 50, 60, np.inf]
age_labels = ['VeryYoung', 'Young', 'Middle', 'MiddleOld', 'Old', 'VeryOld']
test_data['Age_Bucket'] = pd.cut(age_values, bins=age_edges, labels=age_labels)

# Combined label of the form "<Age_Bucket>_<Income_Bucket>".
age_bucket_text = test_data['Age_Bucket'].astype(str)
income_bucket_text = test_data['Income_Bucket'].astype(str)
test_data['Age_Income_Interaction'] = age_bucket_text + '_' + income_bucket_text

# Row count per income bucket, in bucket order. Despite its name, this variable
# only counts Income_Bucket.
Income_Age_Bucket_bin_counts = test_data['Income_Bucket'].value_counts().sort_index()
print(Income_Age_Bucket_bin_counts)

# 4. Bucket DurationOfPitch into four labelled intervals.
# The last label is spelled 'Verylong' (lower-case "l"); any later lookup on this
# column has to use exactly that spelling.
pitch_edges = [0, 400, 800, 1000, np.inf]
pitch_labels = ['Short', 'Medium', 'Long', 'Verylong']
test_data['DurationOfPitch_Category'] = pd.cut(
    test_data['DurationOfPitch'], bins=pitch_edges, labels=pitch_labels
)

# Row count per duration bucket, in bucket order.
DurationOfPitch_Category_bin_counts = test_data['DurationOfPitch_Category'].value_counts().sort_index()
print(DurationOfPitch_Category_bin_counts)

# 5-10. Pairwise product and ratio features, added in this column order:
#   5. Age x DurationOfPitch
#   6. CityTier x Age
#   7. CityTier x MonthlyIncome
#   8. MonthlyIncome / FamilySize
#   9. MonthlyIncome / Age
#  10. PitchSatisfactionScore / NumberOfPersonVisiting
test_data = test_data.assign(
    Age_DurationOfPitch=test_data['Age'] * test_data['DurationOfPitch'],
    CityTier_Age=test_data['CityTier'] * test_data['Age'],
    CityTier_MonthlyIncome=test_data['CityTier'] * test_data['MonthlyIncome'],
    MonthlyIncome_PerFamily=test_data['MonthlyIncome'] / test_data['FamilySize'],
    Income_Per_Age=test_data['MonthlyIncome'] / test_data['Age'],
    Satisfaction_PerVisitor=test_data['PitchSatisfactionScore'] / test_data['NumberOfPersonVisiting'],
)

# Five-year age bands (the first band is 0-19, the last one is capped at 100).
age_group_edges = [0, 19, 24, 29, 34, 39, 44, 49, 54, 59, 64, 69, 100]
age_group_labels = ['0-19', '20-24', '25-29', '30-34', '35-39', '40-44',
                    '45-49', '50-54', '55-59', '60-64', '65-69', '70+']
test_data['AgeGroup'] = pd.cut(test_data['Age'], bins=age_group_edges, labels=age_group_labels)

# Mean MonthlyIncome per age band, as a two-column frame (AgeGroup, AverageIncome).
# NOTE(review): the mean is computed from this (test) file itself — confirm that
# this matches how the training features were built.
age_group_avg_income = (
    test_data.groupby('AgeGroup', observed=False)['MonthlyIncome']
    .mean()
    .rename('AverageIncome')
    .reset_index()
)

# Attach the band average to every row as the AverageIncome feature.
test_data = test_data.merge(age_group_avg_income, on='AgeGroup', how='left')

# Names of the 14 columns this cell is expected to have created.
created_features = [
    'FamilySize',
    'Age_MonthlyIncome_Interaction',
    'Income_Bucket',
    'Age_Bucket',
    'Age_Income_Interaction',
    'DurationOfPitch_Category',
    'Age_DurationOfPitch',
    'CityTier_Age',
    'CityTier_MonthlyIncome',
    'MonthlyIncome_PerFamily',
    'Income_Per_Age',
    'Satisfaction_PerVisitor',
    'AgeGroup',
    'AverageIncome',  # mean MonthlyIncome of the row's age band
]

# Report any expected column that is absent from the frame.
existing_columns = set(test_data.columns)
missing_features = [name for name in created_features if name not in existing_columns]
print(
    f"以下の特徴量がデータセットに含まれていません: {missing_features}"
    if missing_features
    else "すべての特徴量がデータセットに含まれています。"
)

# Show the dtype of every column as a final check.
print(test_data.dtypes)

# Create the output directory when it does not exist yet.
save_dir = '/Users/hayakawakazue/Downloads/signate/test'
save_dir_exists = os.path.exists(save_dir)
if not save_dir_exists:
    os.makedirs(save_dir)

# Write the enriched frame to CSV without the index column.
test_data_path = os.path.join(save_dir, 'test_new_features_without_target.csv')
test_data.to_csv(test_data_path, index=False)

print(f"新しい特徴量を作成しデータを保存しました: {test_data_path}")

Income_Bucket
VeryLow        34
Low           847
Medium       1081
High          747
VeryHigh      308
UltraHigh     472
Name: count, dtype: int64
DurationOfPitch_Category
Short        113
Medium      1709
Long         805
Verylong     862
Name: count, dtype: int64
すべての特徴量がデータセットに含まれています。
id                                  int64
Age                                 int64
TypeofContact                      object
CityTier                            int64
DurationOfPitch                     int64
Occupation                         object
Gender                             object
NumberOfPersonVisiting              int64
NumberOfFollowups                   int64
ProductPitched                     object
PreferredPropertyStar               int64
NumberOfTrips                       int64
Passport                            int64
PitchSatisfactionScore              int64
Designation                        object
MonthlyIncome                       int64
MaritalStatus                       I

## 新しい特徴量を追加する
必要に応じて、今後さらに特徴量を追加することを検討します。

In [13]:
import pandas as pd
import numpy as np
import os

# Load the feature file.
# NOTE(review): this reads 'test_new_features_without_using_target.csv', whereas the
# feature-creation cell above writes 'test_new_features_without_target.csv' —
# confirm which file is intended.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_new_features_without_using_target.csv'
test_data = pd.read_csv(data_path, low_memory=False)

# Additional ratio / product features. The "+ 1" in the denominators keeps the
# division defined when the denominator column is 0.
test_data['Visits_to_Income_Ratio'] = test_data['NumberOfPersonVisiting'] / (test_data['MonthlyIncome'] + 1)
test_data['Family_Income_Stability_Index'] = test_data['MonthlyIncome'] / (test_data['FamilySize'] + 1)
test_data['Pitch_Effectiveness_Index'] = test_data['PitchSatisfactionScore'] * test_data['NumberOfFollowups']
test_data['CityTier_Age_NumberOfPersonVisiting'] = test_data['CityTier_Age'] * test_data['NumberOfPersonVisiting']
test_data['MaritalStatus_Income_Interaction'] = test_data['MaritalStatus'] * test_data['MonthlyIncome']
# NOTE(review): the numerator is CityTier_Age, not CityTier, although the column is
# named CityTier_Income_Ratio — confirm this is intended.
test_data['CityTier_Income_Ratio'] = test_data['CityTier_Age'] / (test_data['MonthlyIncome'] + 1)

# Preview the first rows, including the new columns.
print(test_data.head())

# Create the output directory when it does not exist yet.
save_dir = '/Users/hayakawakazue/Downloads/signate/test'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Write the result to CSV without the index column.
test_data_path = os.path.join(save_dir, 'test_new_features_add_more.csv')
test_data.to_csv(test_data_path, index=False)

print(f"新しい特徴量を作成しデータを保存しました: {test_data_path}")


     id  Age    TypeofContact  CityTier  DurationOfPitch      Occupation  \
0  3489   48     Self Enquiry         2            780.0  Small Business   
1  3490   30     Self Enquiry         2            720.0  Small Business   
2  3491   25     Self Enquiry         1            540.0        Salaried   
3  3492   21  Company Invited         2            420.0        Salaried   
4  3493   41  Company Invited         1            420.0        Salaried   

   Gender  NumberOfPersonVisiting  NumberOfFollowups ProductPitched  ...  \
0    Male                       1                  4   Super Deluxe  ...   
1  Female                       1                  4       Standard  ...   
2  Female                       1                  4          Basic  ...   
3    Male                       1                  4          Basic  ...   
4    Male                       1                  4          Basic  ...   

   Satisfaction_PerVisitor  AgeGroup  averageIncome  Visits_to_Income_Ratio  \
0      

## 数値変数を確認する

In [4]:
import pandas as pd

# Load the engineered test features (the path is under test/, not train/).
# MaritalStatus, CarCount and ChildCount are read with pandas' nullable Int64 dtype.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_new_features_without_target.csv'
test_data = pd.read_csv(data_path, low_memory=False, dtype={'MaritalStatus': 'Int64', 'CarCount': 'Int64', 'ChildCount': 'Int64'})

# Keep only the columns matched by the integer / float dtype filter.
numeric_columns = test_data.select_dtypes(include=['int64', 'float64', 'Int64']).columns

# List the numeric column names.
print("数値変数:")
print(numeric_columns)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
test_data.info()


数値変数:
Index(['id', 'Age', 'CityTier', 'DurationOfPitch', 'NumberOfPersonVisiting',
       'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips',
       'Passport', 'PitchSatisfactionScore', 'MonthlyIncome', 'MaritalStatus',
       'CarCount', 'ChildCount', 'Age_ProdTaken_Mean', 'Income_ProdTaken_Mean',
       'Age_Target_Enc', 'Income_Target_Enc', 'FamilySize',
       'Age_MonthlyIncome_Interaction', 'Age_DurationOfPitch', 'CityTier_Age',
       'CityTier_MonthlyIncome', 'MonthlyIncome_PerFamily', 'Income_Per_Age',
       'Satisfaction_PerVisitor', 'AverageIncome'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 38 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             3489 non-null   int64  
 1   Age                            3489 non-null   int64  
 2   TypeofContact                  3489 non-null 

## 相関関係の高い特徴量の一方を削除する

In [31]:
import pandas as pd
import os

# Load the engineered test features.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_new_features_without_target.csv'
test_data = pd.read_csv(data_path)

# Columns to remove; per the section heading, each is one member of a highly
# correlated pair of features.
columns_to_drop = [
    'CityTier_MonthlyIncome', 'Age', 'ChildCount', 'MonthlyIncome_PerFamily',
    'CityTier', 'MonthlyIncome', 'DurationOfPitch'
]

# Build a new frame without those columns (test_data itself is left unchanged).
test_with_reduced_features = test_data.drop(columns=columns_to_drop)

# Create the output directory when it does not exist yet.
save_dir = '/Users/hayakawakazue/Downloads/signate/test'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

# Write the reduced frame to CSV without the index column.
test_data_path = os.path.join(save_dir, 'test_with_reduced_features.csv')
test_with_reduced_features.to_csv(test_data_path, index=False)

print(f"指定した特徴量を削除したデータを保存しました: {test_data_path}")

# Show the remaining columns and their dtypes.
print(test_with_reduced_features.dtypes)

指定した特徴量を削除したデータを保存しました: /Users/hayakawakazue/Downloads/signate/test/test_with_reduced_features.csv
id                                 int64
TypeofContact                     object
Occupation                        object
Gender                            object
NumberOfPersonVisiting             int64
NumberOfFollowups                  int64
ProductPitched                    object
PreferredPropertyStar              int64
NumberOfTrips                      int64
Passport                           int64
PitchSatisfactionScore             int64
Designation                       object
MaritalStatus                      int64
CarCount                           int64
Age_ProdTaken_Mean               float64
Income_ProdTaken_Mean            float64
FamilySize                         int64
Age_MonthlyIncome_Interaction      int64
Income_Bucket                     object
Age_Bucket                        object
Age_Income_Interaction            object
DurationOfPitch_Category          object

## カテゴリ変数を確認する

In [13]:
import pandas as pd

# Load the reduced test features (the path is under test/, not train/).
# NOTE(review): 'test_with_reduced7_features.csv' is not written by any cell visible
# in this notebook (the drop cell writes 'test_with_reduced_features.csv') — confirm
# where this file comes from.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_with_reduced7_features.csv'
test_data = pd.read_csv(data_path, low_memory=False, dtype={'MaritalStatus': 'Int64', 'CarCount': 'Int64', 'ChildCount': 'Int64'})

# Keep only the columns whose dtype is object or category.
categorical_columns = test_data.select_dtypes(include=['object', 'category']).columns

# List the categorical column names.
print("カテゴリ変数:")
print(categorical_columns)

# Preview the first rows of the categorical columns only.
print("\nカテゴリ変数のデータ:")
print(test_data[categorical_columns].head())


カテゴリ変数:
Index(['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',
       'Designation', 'Income_Bucket', 'Age_Bucket', 'Age_Income_Interaction',
       'DurationOfPitch_Category', 'Age_ProdTaken_Bucket', 'AgeGroup'],
      dtype='object')

カテゴリ変数のデータ:
     TypeofContact      Occupation  Gender ProductPitched     Designation  \
0     Self Enquiry  Small Business    Male   Super Deluxe             AVP   
1     Self Enquiry  Small Business  Female       Standard  Senior Manager   
2     Self Enquiry        Salaried  Female          Basic       Executive   
3  Company Invited        Salaried    Male          Basic  Senior Manager   
4  Company Invited        Salaried    Male          Basic       Executive   

  Income_Bucket Age_Bucket Age_Income_Interaction DurationOfPitch_Category  \
0     UltraHigh  MiddleOld    MiddleOld_UltraHigh                   Medium   
1           Low      Young              Young_Low                   Medium   
2           Low      Young              You

## カテゴリ変数に含まれるユニークな値を確認する

In [15]:
import pandas as pd

# Load the reduced test features (the path is under test/, not train/).
# NOTE(review): this file is not written by any cell visible in this notebook.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_with_reduced7_features.csv'
test_data = pd.read_csv(data_path, low_memory=False, dtype={'MaritalStatus': 'Int64', 'CarCount': 'Int64', 'ChildCount': 'Int64'})

# Print the distinct values of the Age_Income_Interaction column.
print(test_data['Age_Income_Interaction'].unique())

['MiddleOld_UltraHigh' 'Young_Low' 'MiddleOld_Low' 'MiddleOld_Medium'
 'MiddleOld_High' 'Young_VeryLow' 'Middle_Low' 'MiddleOld_VeryHigh'
 'VeryYoung_Low' 'Old_UltraHigh' 'Middle_VeryHigh' 'Middle_High'
 'Old_VeryHigh' 'Middle_VeryLow' 'VeryYoung_Medium' 'Middle_UltraHigh'
 'Old_High' 'Middle_Medium' 'Old_Medium' 'Young_High' 'Old_Low'
 'Young_UltraHigh' 'Young_Medium' 'Young_VeryHigh' 'VeryYoung_High'
 'VeryYoung_VeryHigh' 'VeryOld_UltraHigh' 'MiddleOld_VeryLow'
 'VeryOld_VeryHigh' 'VeryOld_Medium']


## カテゴリ変数のランク付を定義する

In [16]:
import pandas as pd
import os

# Load the engineered test features.
test_data_path = '/Users/hayakawakazue/Downloads/signate/test/test_new_features_without_target.csv'
test_data = pd.read_csv(test_data_path)

# Each table below assigns rank 1, 2, 3, ... to the labels in the listed order.
# Series.map leaves a value that is missing from the table as NaN.

# 1. ProductPitched -> ProductPitched_Rank (1..5)
product_pitched_ranks = {
    label: rank
    for rank, label in enumerate(['Basic', 'Standard', 'Deluxe', 'Super Deluxe', 'King'], start=1)
}
test_data['ProductPitched_Rank'] = test_data['ProductPitched'].map(product_pitched_ranks)

# 2. Designation (job title) -> Designation_Rank (1..5)
designation_ranks = {
    label: rank
    for rank, label in enumerate(['Executive', 'Manager', 'Senior Manager', 'AVP', 'VP'], start=1)
}
test_data['Designation_Rank'] = test_data['Designation'].map(designation_ranks)

# 3. Income_Bucket -> Income_Bucket_Rank (1..6)
income_bucket_ranks = {
    label: rank
    for rank, label in enumerate(['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh', 'UltraHigh'], start=1)
}
test_data['Income_Bucket_Rank'] = test_data['Income_Bucket'].map(income_bucket_ranks)

# 4. Age_Bucket -> Age_Bucket_Rank (1..6)
age_bucket_ranks = {
    label: rank
    for rank, label in enumerate(['VeryYoung', 'Young', 'Middle', 'MiddleOld', 'Old', 'VeryOld'], start=1)
}
test_data['Age_Bucket_Rank'] = test_data['Age_Bucket'].map(age_bucket_ranks)

# 5-1. Rank table for the age half of an Age_Income_Interaction label.
age_bucket_ranks = {
    'VeryYoung': 1,
    'Young': 2,
    'Middle': 3,
    'MiddleOld': 4,
    'Old': 5,
    'VeryOld': 6
}

# 5-2. Rank table for the income half of an Age_Income_Interaction label.
income_bucket_ranks = {
    'VeryLow': 1,
    'Low': 2,
    'Medium': 3,
    'High': 4,
    'VeryHigh': 5,
    'UltraHigh': 6
}

# 5-3. Rank of a combined "<AgeBucket>_<IncomeBucket>" label.
def rank_age_income_interaction(value):
    """Return age-bucket rank times income-bucket rank for a combined label.

    Parameters
    ----------
    value : str
        Label such as 'MiddleOld_UltraHigh'. It is split at the LAST underscore;
        the left part is looked up in ``age_bucket_ranks`` and the right part in
        ``income_bucket_ranks``.

    Returns
    -------
    int or float
        The product of the two ranks, or NaN when ``value`` is not a string
        (e.g. a missing value), has no underscore, or contains a part that is
        absent from the rank tables (e.g. 'nan_Low').
    """
    # Series.map passes missing values (float NaN / None) to the callable as-is;
    # they have no string methods, so they are answered with NaN right away.
    if not isinstance(value, str):
        return float('nan')

    # rpartition splits at the last underscore; without one, age_part is ''.
    age_part, _, income_part = value.rpartition('_')
    try:
        return age_bucket_ranks[age_part] * income_bucket_ranks[income_part]
    except KeyError:
        # An unknown label raises KeyError (not ValueError). float('nan') is used
        # instead of np.nan because this cell does not import numpy.
        return float('nan')

# Rank every combined Age/Income label with rank_age_income_interaction.
test_data['Age_Income_Interaction_Rank'] = test_data['Age_Income_Interaction'].map(rank_age_income_interaction)

# 6. DurationOfPitch_Category -> DurationOfPitch_Category_Rank.
# The feature-creation cell labels the top bucket 'Verylong' (lower-case "l").
# The table previously only had 'VeryLong', so every row of that bucket was mapped
# to NaN. Both spellings are accepted so either labelling works.
# NOTE(review): keep the training notebook's table in sync with this one.
duration_of_pitch_ranks = {
    'Short': 1,
    'Medium': 2,
    'Long': 3,
    'Verylong': 4,
    'VeryLong': 4
}
test_data['DurationOfPitch_Category_Rank'] = test_data['DurationOfPitch_Category'].map(duration_of_pitch_ranks)

# 7. AgeGroup -> AgeGroup_Rank.
# The AgeGroup labels created upstream also include '65-69' and '70+'; without
# entries for them those rows were mapped to NaN.
age_group_ranks = {
    '0-19': 1,
    '20-24': 2,
    '25-29': 3,
    '30-34': 4,
    '35-39': 5,
    '40-44': 6,
    '45-49': 7,
    '50-54': 8,
    '55-59': 9,
    '60-64': 10,
    '65-69': 11,
    '70+': 12
}
test_data['AgeGroup_Rank'] = test_data['AgeGroup'].map(age_group_ranks)

# Save the frame, now including the *_Rank columns, without the index column.
output_path = '/Users/hayakawakazue/Downloads/signate/test/test_mapping.csv'
test_data.to_csv(output_path, index=False)
print(f"ランク付けを含むデータを保存しました: {output_path}")

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
test_data.info()


ランク付けを含むデータを保存しました: /Users/hayakawakazue/Downloads/signate/test/test_mapping.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 40 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             3489 non-null   int64  
 1   Age                            3489 non-null   int64  
 2   TypeofContact                  3489 non-null   object 
 3   CityTier                       3489 non-null   int64  
 4   DurationOfPitch                3489 non-null   int64  
 5   Occupation                     3489 non-null   object 
 6   Gender                         3489 non-null   object 
 7   NumberOfPersonVisiting         3489 non-null   int64  
 8   NumberOfFollowups              3489 non-null   int64  
 9   ProductPitched                 3489 non-null   object 
 10  PreferredPropertyStar          3489 non-null   int64  
 11  NumberOfTrips              

## ワンホットエンコーディングを適用する

In [19]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import joblib

# Load the rank-mapped test features.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_mapping.csv'
test_data = pd.read_csv(data_path, low_memory=False)

# Load the encoder stored under train/ so the test split is encoded with it.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code — only
# load artefacts produced by your own training run.
# NOTE(review): presumably a fitted sklearn OneHotEncoder; the pickle is not visible here.
onehot_encoder_path = '/Users/hayakawakazue/Downloads/signate/train/onehot_encoder.pkl'
onehot_encoder = joblib.load(onehot_encoder_path)

# Categorical columns to one-hot encode.
categorical_columns = [
    'TypeofContact', 'Occupation', 'Gender'
]

# Fail early, with a clear message, when an expected column is missing.
for column in categorical_columns:
    if column not in test_data.columns:
        raise ValueError(f"テストデータに '{column}' 列が存在しません。")

# Encode the categorical columns.
encoded_categorical_data = onehot_encoder.transform(test_data[categorical_columns])

# OneHotEncoder returns a SciPy sparse matrix unless it was configured for dense
# output. Densify in that case so the DataFrame below gets one column per
# indicator; an already-dense array passes through unchanged.
if hasattr(encoded_categorical_data, 'toarray'):
    encoded_categorical_data = encoded_categorical_data.toarray()

# Wrap the indicator matrix in a DataFrame, named by the encoder's feature names.
encoded_categorical_df = pd.DataFrame(
    encoded_categorical_data,
    columns=onehot_encoder.get_feature_names_out(categorical_columns),
)

# Replace the raw categorical columns with their indicator columns. Both indexes
# are reset so the column-wise concat lines rows up by position.
test_data = test_data.drop(columns=categorical_columns)
test_data = pd.concat([test_data.reset_index(drop=True), encoded_categorical_df.reset_index(drop=True)], axis=1)

# Preview the result.
print(test_data.head())

# Save the encoded frame without the index column.
test_encoded_path = '/Users/hayakawakazue/Downloads/signate/test/test_onehot_encoded.csv'
test_data.to_csv(test_encoded_path, index=False)
print(f"エンコードされたデータを保存しました: {test_encoded_path}")

     id  Age  CityTier  DurationOfPitch  NumberOfPersonVisiting  \
0  3489   48         2              780                       1   
1  3490   30         2              720                       1   
2  3491   25         1              540                       1   
3  3492   21         2              420                       1   
4  3493   41         1              420                       1   

   NumberOfFollowups ProductPitched  PreferredPropertyStar  NumberOfTrips  \
0                  4   Super Deluxe                      3              7   
1                  4       Standard                      3              4   
2                  4          Basic                      3              1   
3                  4          Basic                      4              1   
4                  4          Basic                      3              1   

   Passport  ...  Age_Income_Interaction_Rank DurationOfPitch_Category_Rank  \
0         0  ...                           24          

## 列の確認をする

In [18]:
import pandas as pd
import numpy as np

# Load the one-hot encoded test features.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_onehot_encoded.csv'
test_data = pd.read_csv(data_path, low_memory=False)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 44 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   id                             3489 non-null   int64  
 1   Age                            3489 non-null   int64  
 2   CityTier                       3489 non-null   int64  
 3   DurationOfPitch                3489 non-null   int64  
 4   NumberOfPersonVisiting         3489 non-null   int64  
 5   NumberOfFollowups              3489 non-null   int64  
 6   ProductPitched                 3489 non-null   object 
 7   PreferredPropertyStar          3489 non-null   int64  
 8   NumberOfTrips                  3489 non-null   int64  
 9   Passport                       3489 non-null   int64  
 10  PitchSatisfactionScore         3489 non-null   int64  
 11  Designation                    3489 non-null   object 
 12  MonthlyIncome                  3489 non-null   i

## 新しい特徴量を追加する

In [20]:
import pandas as pd
import numpy as np

# Load the one-hot encoded test features.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_onehot_encoded.csv'
test_data = pd.read_csv(data_path)

# Add six product features in one step (columns are appended in this order):
#   interaction: CityTier_Age x Income_Per_Age, FamilySize x Income_Per_Age
#   derived:     NumberOfTrips x DurationOfPitch_Category_Rank,
#                NumberOfFollowups x PitchSatisfactionScore
#   behavioural: ProductPitched_Rank x PitchSatisfactionScore,
#                Designation_Rank x PreferredPropertyStar
test_data = test_data.assign(
    CityTier_Age_Income_Interaction=test_data['CityTier_Age'] * test_data['Income_Per_Age'],
    FamilySize_Income_Interaction=test_data['FamilySize'] * test_data['Income_Per_Age'],
    TotalTrips=test_data['NumberOfTrips'] * test_data['DurationOfPitch_Category_Rank'],
    EngagementScore=test_data['NumberOfFollowups'] * test_data['PitchSatisfactionScore'],
    ProductPitched_Satisfaction=test_data['ProductPitched_Rank'] * test_data['PitchSatisfactionScore'],
    Designation_PropertyStar=test_data['Designation_Rank'] * test_data['PreferredPropertyStar'],
)

# Save the extended frame without the index column.
output_path = '/Users/hayakawakazue/Downloads/signate/test/test_add_new_features0901_1.csv'
test_data.to_csv(output_path, index=False)
print(f"新しい特徴量を追加したデータを保存しました: {output_path}")


新しい特徴量を追加したデータを保存しました: /Users/hayakawakazue/Downloads/signate/test/test_add_new_features0901_1.csv


## 双子の関係にある特徴量の一方を削除する

In [8]:
import pandas as pd

# Load the test features.
# NOTE(review): 'test_with_cluster_features.csv' is not written by any cell visible
# in this notebook — confirm where it is produced.
test_data_path = '/Users/hayakawakazue/Downloads/signate/test/test_with_cluster_features.csv'
test_data = pd.read_csv(test_data_path)

# Columns to remove; per the section heading, each is one member of a pair of
# "twin" (near-duplicate) features.
columns_to_drop = [
    'Gender_Male',
    'TypeofContact_Self Enquiry',
    'AgeGroup_Rank',
    'Age_Income_Interaction_Rank',
    'Designation_PropertyStar',
    'Age_Bucket_Rank',
    'PitchSatisfactionScore',
    'Age_ProdTaken_Bucket_Rank',
    'AverageIncome',
    'CityTier_Age',
    'TotalTrips'
]

# Drop the listed columns.
test_data = test_data.drop(columns=columns_to_drop)

# Show the remaining columns and their dtypes.
print(test_data.dtypes)

# Save the reduced frame without the index column.
output_test_path = '/Users/hayakawakazue/Downloads/signate/test/test_with_reduced_AverageIncome.csv'
test_data.to_csv(output_test_path, index=False)
print(f"双子の関係にある一方の特徴量を削除しデータセットを保存しました: {output_test_path}")


id                                   int64
NumberOfPersonVisiting               int64
NumberOfFollowups                    int64
ProductPitched                      object
PreferredPropertyStar                int64
NumberOfTrips                        int64
Passport                             int64
Designation                         object
MaritalStatus                        int64
CarCount                             int64
Age_ProdTaken_Mean                 float64
Income_ProdTaken_Mean              float64
FamilySize                           int64
Age_MonthlyIncome_Interaction        int64
Income_Bucket                       object
Age_Bucket                          object
Age_Income_Interaction              object
DurationOfPitch_Category            object
Age_ProdTaken_Bucket                object
Age_DurationOfPitch                  int64
Income_Per_Age                     float64
Satisfaction_PerVisitor            float64
AgeGroup                            object
ProductPitc

## 新しい特徴量を追加する

In [21]:
import pandas as pd
import numpy as np

# Load the test features produced by the interaction-feature cell
# (the path is under test/, not train/).
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_add_new_features0901_1.csv'
test_data = pd.read_csv(data_path, low_memory=False)

# Multiply CityTier_Age_Income_Interaction by FamilySize_Income_Interaction.
test_data['CityTier_Age_Income_x_FamilySize_Income'] = (
    test_data['CityTier_Age_Income_Interaction'] * 
    test_data['FamilySize_Income_Interaction']
)

# Save the extended frame without the index column.
output_path = '/Users/hayakawakazue/Downloads/signate/test/test_with_CityTierAgeIncomeFamilySizeIncome.csv'
test_data.to_csv(output_path, index=False)
print(f"CityTier_Age_Income_x_FamilySize_Incomeを含むデータセットを {output_path} に保存しました。")

CityTier_Age_Income_x_FamilySize_Incomeを含むデータセットを /Users/hayakawakazue/Downloads/signate/test/test_with_CityTierAgeIncomeFamilySizeIncome.csv に保存しました。


## スケーリングを適用する

In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import joblib

# Load the test features to be scaled.
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_with_CityTierAgeIncomeFamilySizeIncome.csv'
test_data = pd.read_csv(data_path, low_memory=False)

# Set the id column aside so it is never scaled.
id_column = test_data['id']
X_test_numeric = test_data.drop(columns=['id'])

# Columns handed to the scaler, in this exact order.
# NOTE(review): the one-hot names here use underscores
# ('TypeofContact_Self_Enquiry'), while another cell refers to
# 'TypeofContact_Self Enquiry' — confirm the names match the input file.
columns_to_scale = [
    'Age', 'CityTier', 'DurationOfPitch',
    'NumberOfPersonVisiting', 'NumberOfFollowups', 'PreferredPropertyStar',
    'NumberOfTrips', 'PitchSatisfactionScore', 'MonthlyIncome', 'FamilySize',
    'Age_DurationOfPitch', 'CityTier_Age', 'CityTier_MonthlyIncome', 'MonthlyIncome_PerFamily',
    'Income_Per_Age', 'Satisfaction_PerVisitor', 'AverageIncome', 'ProductPitched_Rank',
    'Designation_Rank', 'Income_Bucket_Rank', 'Age_Bucket_Rank', 'Age_Income_Interaction_Rank',
    'DurationOfPitch_Category_Rank', 'AgeGroup_Rank', 'TypeofContact_Company_Invited',
    'TypeofContact_Self_Enquiry', 'Occupation_Large_Business', 'Occupation_Salaried',
    'Occupation_Small_Business', 'CityTier_Age_Income_Interaction', 'FamilySize_Income_Interaction',
    'TotalTrips', 'EngagementScore', 'ProductPitched_Satisfaction', 'Designation_PropertyStar',
    'CityTier_Age_Income_x_FamilySize_Income',
]

# Per column: turn +/-inf into NaN, then fill every NaN with the mean of the
# remaining values of that same column.
# NOTE(review): the fill value is the mean of this (test) file — confirm that this
# matches what was done before the scaler was fitted.
for col in columns_to_scale:
    finite_values = X_test_numeric[col].replace([np.inf, -np.inf], np.nan)
    X_test_numeric[col] = finite_values.fillna(finite_values.mean())

# Apply the scaler stored under train/ (transform only, no re-fitting).
scaler_path = '/Users/hayakawakazue/Downloads/signate/train/train_scaler.pkl'
scaler = joblib.load(scaler_path)
scaled_values = scaler.transform(X_test_numeric[columns_to_scale])
X_test_numeric[columns_to_scale] = scaled_values

# Put the untouched id column back in front of the scaled features.
X_test_scaled = pd.concat([id_column, X_test_numeric], axis=1)

# Save the scaled frame without the index column.
output_path = '/Users/hayakawakazue/Downloads/signate/test/test_scaled.csv'
X_test_scaled.to_csv(output_path, index=False)
print(f"ID列を保持したままスケーリング済みのデータを保存しました: {output_path}")


ID列を保持したままスケーリング済みのデータを保存しました: /Users/hayakawakazue/Downloads/signate/test/test_scaled.csv


## データ型の確認

In [12]:
import pandas as pd
import numpy as np

# Load the scaled test features (the path is under test/, not train/).
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_scaled.csv'
test_data = pd.read_csv(data_path, low_memory=False)

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 50 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   id                                       3489 non-null   int64  
 1   NumberOfPersonVisiting                   3489 non-null   float64
 2   NumberOfFollowups                        3489 non-null   int64  
 3   ProductPitched                           3489 non-null   object 
 4   PreferredPropertyStar                    3489 non-null   float64
 5   NumberOfTrips                            3489 non-null   int64  
 6   Passport                                 3489 non-null   int64  
 7   PitchSatisfactionScore                   3489 non-null   float64
 8   Designation                              3489 non-null   object 
 9   MaritalStatus                            3489 non-null   int64  
 10  CarCount                                 3489 no

## 不要な列を削除する

In [23]:
import pandas as pd
import numpy as np

# Load the scaled test features (the path is under test/, not train/).
data_path = '/Users/hayakawakazue/Downloads/signate/test/test_scaled.csv'
test_data = pd.read_csv(data_path, low_memory=False)

# Drop the raw label columns whose *_Rank counterparts were created earlier, plus
# CarCount and a few engineered columns that are no longer wanted.
test_data = test_data.drop(columns=[
    'ProductPitched', 'Designation', 'Income_Bucket', 'Age_Bucket',
    'Age_Income_Interaction', 'DurationOfPitch_Category', 'AgeGroup',
    'CarCount', 'Age_DurationOfPitch', 'DurationOfPitch_Category_Rank',
    'CityTier_Age_Income_Interaction', 'FamilySize_Income_Interaction'
])

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appended a stray "None" line.
test_data.info()

# Save the final frame without the index column.
output_path = '/Users/hayakawakazue/Downloads/signate/test/test_proceed_0901_1.csv'
test_data.to_csv(output_path, index=False)
print(f"不要な列を削除してデータを保存しました: {output_path}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3489 entries, 0 to 3488
Data columns (total 39 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   id                                       3489 non-null   int64  
 1   Age                                      3489 non-null   float64
 2   CityTier                                 3489 non-null   float64
 3   DurationOfPitch                          3489 non-null   float64
 4   NumberOfPersonVisiting                   3489 non-null   float64
 5   NumberOfFollowups                        3489 non-null   float64
 6   PreferredPropertyStar                    3489 non-null   float64
 7   NumberOfTrips                            3489 non-null   float64
 8   Passport                                 3489 non-null   int64  
 9   PitchSatisfactionScore                   3489 non-null   float64
 10  MonthlyIncome                            3489 no