In [15]:
from google.colab import drive

# Mount Google Drive at /content/drive so the notebook can access files stored there.
drive.mount('/content/drive')

# Confirmation printed after drive.mount() returns (the message says the drive was mounted).
print("Google Driveがマウントされました！")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Driveがマウントされました！


In [19]:
import pandas as pd

# Location of the test set on the mounted Google Drive.
file_path = '/content/drive/My Drive/signate/test.csv'

# Read the CSV; low_memory=False parses the file in one pass instead of in chunks,
# which avoids mixed-dtype inference within a column.
test_data = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Preview the first rows of the loaded data.
test_data.head()


Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type
0,1496754,1912.5,3 years,10.16,B1,5 years,debt_consolidation,725.0,Individual
1,84909594,1800.0,3 years,8.99,B1,9 years,credit_card,695.0,Individual
2,1165403,550.0,3 years,14.65,C2,10 years,credit_card,660.0,Individual
3,91354446,2000.0,5 years,15.59,C5,10 years,credit_card,695.0,Individual
4,85636932,1500.0,5 years,12.79,C1,0 years,medical,720.0,Individual


In [None]:
# Print the column dtypes, non-null counts and memory usage of the raw test set.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26906 entries, 0 to 26905
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 26906 non-null  int64  
 1   loan_amnt          26906 non-null  float64
 2   term               26906 non-null  object 
 3   interest_rate      26906 non-null  float64
 4   grade              26906 non-null  object 
 5   employment_length  25466 non-null  object 
 6   purpose            26906 non-null  object 
 7   credit_score       26906 non-null  float64
 8   application_type   26906 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 1.8+ MB


In [None]:
# Column under inspection.
application_type_col = test_data['application_type']

# Distinct values that occur in the column.
print("Unique values in 'application_type':", application_type_col.unique())

# Number of distinct values; dropna=False counts NaN as a value of its own.
unique_count = application_type_col.nunique(dropna=False)
print(f"Total unique values in 'application_type': {unique_count}")

Unique values in 'application_type': ['Individual' 'Joint App']
Total unique values in 'application_type': 2


# 前処理

In [20]:
import re
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import pickle

# Re-read the test set from Google Drive into a fresh frame for preprocessing.
# low_memory=False parses the file in one pass, avoiding mixed-dtype inference.
file_path = '/content/drive/My Drive/signate/test.csv'
test_df = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Parser for the 'employment_length' column.
def process_employment_length(value):
    """Convert an employment-length label into a number of years.

    Examples: '5 years' -> 5.0, '1 year' -> 1.0, '< 1 year' -> 0.0,
    '10+ years' -> 10.0.

    Args:
        value: Raw label (string), an already-numeric value, or a missing
            value (NaN / None).

    Returns:
        float: Years of employment, or NaN when ``value`` is missing.

    Raises:
        ValueError: If the text that remains after cleaning is not a number.
    """
    if pd.isna(value):  # Missing values stay missing.
        return np.nan
    # str() lets already-numeric inputs pass through instead of failing in re.sub.
    text = str(value)
    # "Less than one year" is counted as zero years.
    text = re.sub(r'<\s*1', '0', text)
    # Drop the unit word ('year' / 'years') and the open-ended '+' of labels like '10+ years'.
    text = re.sub(r'\s*years?|\+', '', text)
    return float(text.strip())

def _term_to_years(raw_term):
    """Strip the ' years' suffix from a loan-term label; non-string values become NaN."""
    if not isinstance(raw_term, str):
        return np.nan
    return float(re.sub(r' years', '', raw_term).strip())


# Turn the employment-length labels into numeric years.
test_df['employment_length'] = test_df['employment_length'].apply(process_employment_length)

# Turn the loan-term labels (e.g. '3 years') into numeric years.
test_df['term'] = test_df['term'].apply(_term_to_years)

# Express the interest rate as a fraction rather than a percentage.
test_df['interest_rate'] = test_df['interest_rate'].div(100)


# Load the grade encoder that was pickled to Google Drive.
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so this
# file must come from a trusted source (presumably the training notebook - verify).
with open('/content/drive/My Drive/grade_encoder.pkl', 'rb') as file:
    loaded_encoder = pickle.load(file)

# Build the label -> code lookup once instead of calling transform() for every row.
# Assumes loaded_encoder is a fitted sklearn LabelEncoder (the notebook imports it and
# this code reads classes_) - TODO confirm. For a LabelEncoder, transform() returns
# each label's position in classes_, so the codes are the same as before.
grade_to_code = {label: code for code, label in enumerate(loaded_encoder.classes_)}

# Encode the test grades; grades the encoder has never seen (or missing ones) become -1.
test_df['grade_encoded'] = (
    test_df['grade'].map(grade_to_code).fillna(-1).astype('int64')
)

# Inputs for the instalment formula: per-month rate and number of monthly payments.
test_df['monthly_interest_rate'] = test_df['interest_rate'].div(12)  # annual rate -> monthly rate
test_df['term_months'] = test_df['term'].mul(12)  # term in years -> number of payments (months)

# Monthly instalment calculator.
def calculate_monthly_payment(loan_amnt, monthly_rate, term_months):
    """Return the fixed monthly instalment of an amortizing loan.

    Uses the annuity formula ``P * r * (1 + r)**n / ((1 + r)**n - 1)`` where
    ``P`` is the principal, ``r`` the monthly rate and ``n`` the number of
    payments. When the rate is not positive the principal is simply divided
    evenly over the term.

    Args:
        loan_amnt: Principal.
        monthly_rate: Interest rate per month as a fraction (e.g. 0.01 for 1%).
        term_months: Number of monthly payments.

    Returns:
        float: The monthly payment. NaN is returned when any input is missing
        or when ``term_months`` is not positive, because no payment schedule
        can be computed in those cases.
    """
    # A missing input must give a missing result. Without this guard a NaN rate
    # fails the `> 0` comparison and is silently treated as a 0% loan.
    if pd.isna(loan_amnt) or pd.isna(monthly_rate) or pd.isna(term_months):
        return np.nan
    # A zero or negative number of payments would divide by zero below.
    if term_months <= 0:
        return np.nan
    if monthly_rate > 0:
        growth = (1 + monthly_rate) ** term_months  # compound factor, computed once
        return loan_amnt * (monthly_rate * growth) / (growth - 1)
    # Rate of zero (or below): split the principal evenly across the payments.
    return loan_amnt / term_months

def _payment_for_row(row):
    """Feed one row's loan amount, monthly rate and term into calculate_monthly_payment."""
    return calculate_monthly_payment(
        row['loan_amnt'], row['monthly_interest_rate'], row['term_months']
    )


# Monthly instalment for every loan, computed row by row.
test_df['monthly_payment'] = test_df.apply(_payment_for_row, axis=1)

# Total repaid over the life of the loan: monthly instalment x number of payments.
test_df['total_payment'] = test_df['monthly_payment'].mul(test_df['term_months'])

# Burden index: total repayment divided by the credit score.
test_df['burden_index'] = test_df['total_payment'].div(test_df['credit_score'])

# Inspect the engineered columns.
test_df.head()



Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,grade_encoded,monthly_interest_rate,term_months,monthly_payment,total_payment,burden_index
0,1496754,1912.5,3.0,0.1016,B1,5.0,debt_consolidation,725.0,Individual,5,0.008467,36.0,61.854761,2226.771378,3.071409
1,84909594,1800.0,3.0,0.0899,B1,9.0,credit_card,695.0,Individual,5,0.007492,36.0,57.231142,2060.321102,2.964491
2,1165403,550.0,3.0,0.1465,C2,10.0,credit_card,660.0,Individual,11,0.012208,36.0,18.971798,682.984714,1.034825
3,91354446,2000.0,5.0,0.1559,C5,10.0,credit_card,695.0,Individual,14,0.012992,60.0,48.201497,2892.089846,4.16128
4,85636932,1500.0,5.0,0.1279,C1,0.0,medical,720.0,Individual,10,0.010658,60.0,33.968575,2038.114488,2.830715


In [21]:
import numpy as np

# total_employment_length = employment length + loan term (both in years);
# a missing employment length contributes 0 years.
test_df["total_employment_length"] = (
    test_df["employment_length"].fillna(0) + test_df["term"]
)

# Inspect the result.
test_df.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,grade_encoded,monthly_interest_rate,term_months,monthly_payment,total_payment,burden_index,total_employment_length
0,1496754,1912.5,3.0,0.1016,B1,5.0,debt_consolidation,725.0,Individual,5,0.008467,36.0,61.854761,2226.771378,3.071409,8.0
1,84909594,1800.0,3.0,0.0899,B1,9.0,credit_card,695.0,Individual,5,0.007492,36.0,57.231142,2060.321102,2.964491,12.0
2,1165403,550.0,3.0,0.1465,C2,10.0,credit_card,660.0,Individual,11,0.012208,36.0,18.971798,682.984714,1.034825,13.0
3,91354446,2000.0,5.0,0.1559,C5,10.0,credit_card,695.0,Individual,14,0.012992,60.0,48.201497,2892.089846,4.16128,15.0
4,85636932,1500.0,5.0,0.1279,C1,0.0,medical,720.0,Individual,10,0.010658,60.0,33.968575,2038.114488,2.830715,5.0


In [23]:
import os

# Destination folder on Google Drive; it is created when it does not exist yet.
output_dir = "/content/drive/My Drive/signate"
os.makedirs(name=output_dir, exist_ok=True)

# Full path of the CSV that will hold the preprocessed test set.
output_file = os.path.join(output_dir, "test_1221_5.csv")

# Write the frame without its index column, then report where it went.
test_df.to_csv(path_or_buf=output_file, index=False)
print(f"データセットが保存されました: {output_file}")

データセットが保存されました: /content/drive/My Drive/signate/test_1221_5.csv


In [25]:
# Show the first rows of the preprocessed frame.
test_df.head()

Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,grade_encoded,monthly_interest_rate,term_months,monthly_payment,total_payment,burden_index,total_employment_length
0,1496754,1912.5,3.0,0.1016,B1,5.0,debt_consolidation,725.0,Individual,5,0.008467,36.0,61.854761,2226.771378,3.071409,8.0
1,84909594,1800.0,3.0,0.0899,B1,9.0,credit_card,695.0,Individual,5,0.007492,36.0,57.231142,2060.321102,2.964491,12.0
2,1165403,550.0,3.0,0.1465,C2,10.0,credit_card,660.0,Individual,11,0.012208,36.0,18.971798,682.984714,1.034825,13.0
3,91354446,2000.0,5.0,0.1559,C5,10.0,credit_card,695.0,Individual,14,0.012992,60.0,48.201497,2892.089846,4.16128,15.0
4,85636932,1500.0,5.0,0.1279,C1,0.0,medical,720.0,Individual,10,0.010658,60.0,33.968575,2038.114488,2.830715,5.0
