In [1]:
from google.colab import drive

# Google Driveをマウント
drive.mount('/content/drive')

# マウント完了後のメッセージ
print("Google Driveがマウントされました！")


Mounted at /content/drive
Google Driveがマウントされました！


In [5]:
import pandas as pd

# CSVファイルを読み込む
file_path = '/content/drive/My Drive/signate/train.csv'
train_data = pd.read_csv(file_path, low_memory= False)

# 読み込んだデータを表示
train_data.head()


Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,loan_status
0,88194295,1800.0,3 years,14.49,C4,,debt_consolidation,665.0,Individual,FullyPaid
1,5146039,1200.0,5 years,16.29,C4,2 years,debt_consolidation,700.0,Individual,ChargedOff
2,3095896,2000.0,5 years,21.98,E4,10 years,home_improvement,670.0,Individual,FullyPaid
3,88625044,1000.0,3 years,8.59,A5,4 years,debt_consolidation,710.0,Individual,FullyPaid
4,1178189,1500.0,3 years,13.99,C1,4 years,debt_consolidation,680.0,Individual,FullyPaid


In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242156 entries, 0 to 242155
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   id                 242156 non-null  int64  
 1   loan_amnt          242156 non-null  float64
 2   term               242156 non-null  object 
 3   interest_rate      242156 non-null  float64
 4   grade              242156 non-null  object 
 5   employment_length  228971 non-null  object 
 6   purpose            242156 non-null  object 
 7   credit_score       242156 non-null  float64
 8   application_type   242156 non-null  object 
 9   loan_status        242156 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 18.5+ MB


In [30]:
# 列のユニークな値
print("Unique values in 'interest_rate':", train_data['interest_rate'].unique())

# 列のユニークな値の総数
unique_count = train_data['interest_rate'].nunique(dropna=False)

print(f"Total unique values in 'interest_rate': {unique_count}")

Unique values in 'interest_rate': [14.49 16.29 21.98  8.59 13.99  7.35 13.49  9.49 18.99  9.93 11.44 23.99
 11.49  8.9  15.8  16.99  5.32 19.99 10.99 14.09 23.28 12.79 21.97 10.16
 17.27 14.08 17.99  6.03 12.12 12.74 16.02  7.49 10.74 13.11  9.76  9.71
 22.74  7.99  7.9  14.33 15.59 23.88 25.69 10.49 12.62 21.   24.49 21.45
  9.91 11.55 18.49 15.31  8.24 18.75  7.21 12.35 17.56 11.14  7.97 13.68
 24.89 14.65 10.42  7.59 19.03 19.05  7.62 22.39 28.18 21.49 17.77 10.91
 18.06 15.05 24.7   6.62 13.59 25.82 14.99 15.99 20.49 20.   22.47  8.99
 15.88 18.55 24.83 26.99 19.72  9.44 26.49  7.24 22.95 15.22  6.99  6.72
 18.25 24.99 17.09 28.69 23.83 11.99 13.67 21.6  26.24 20.31 30.75 11.39
 16.77 13.05 17.58 20.89 19.52  6.08  6.   30.74  7.51 24.74 22.45 19.22
 22.91 10.64 20.5  24.08 30.79 27.79 30.99 29.96 26.3  25.49 12.42 22.78
 15.96 28.72 15.27 27.49 23.63 25.89 22.2  22.7  23.26 12.99 29.67 30.65
 18.85 16.78 10.65 24.85 14.27 21.48 25.29 12.69 29.69 29.49 19.42 15.81
 30.84 20.8  11.7

In [33]:
import numpy as np

# id列を削除
df = train_data.drop(columns=['id'])

# 'years' を削除し、< 1を0に置き換え
df['term'] = df['term'].str.replace(' years', '').str.strip()
df['employment_length'] = df['employment_length'].str.replace(' years', '').str.replace('< 1', '0').str.strip()

# 数値型（float）に変換
df['term'] = df['term'].astype(float)
df['employment_length'] = pd.to_numeric(df['employment_length'], errors='coerce').astype(float)

# 月利と返済回数の準備
df['monthly_interest_rate'] = df['interest_rate'] / 100 / 12  # 月利に変換
df['term_months'] = df['term'] * 12  # 返済回数（月数）

# 毎月の返済額を計算する関数
def calculate_monthly_payment(loan_amnt, monthly_rate, term_months):
    if monthly_rate > 0:  # 月利が0でない場合
        return loan_amnt * (monthly_rate * (1 + monthly_rate)**term_months) / ((1 + monthly_rate)**term_months - 1)
    else:  # 月利が0の場合（単純に元金を均等割）
        return loan_amnt / term_months

# 毎月の返済額を計算
df['monthly_payment'] = df.apply(lambda row: calculate_monthly_payment(
    row['loan_amnt'], row['monthly_interest_rate'], row['term_months']
), axis=1)

# 総返済額（毎月の返済額 × 返済回数）
df['total_payment'] = df['monthly_payment'] * df['term_months']

# 負担指数の計算
df['burden_index'] = df['total_payment'] / df['credit_score']

# FullyPaidを0、ChargedOffを1に変換
df['loan_status'] = df['loan_status'].map({'FullyPaid': 0, 'ChargedOff': 1}).astype(int)


# 結果を確認
df.head()


Unnamed: 0,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,loan_status,monthly_interest_rate,term_months,monthly_payment,total_payment,burden_index
0,1800.0,3.0,14.49,C4,,debt_consolidation,665.0,Individual,0,0.012075,36.0,61.948981,2230.163322,3.353629
1,1200.0,5.0,16.29,C4,2.0,debt_consolidation,700.0,Individual,1,0.013575,60.0,29.366894,1762.013668,2.517162
2,2000.0,5.0,21.98,E4,10.0,home_improvement,670.0,Individual,0,0.018317,60.0,55.215084,3312.905027,4.944634
3,1000.0,3.0,8.59,A5,4.0,debt_consolidation,710.0,Individual,0,0.007158,36.0,31.609257,1137.933255,1.602723
4,1500.0,3.0,13.99,C1,4.0,debt_consolidation,680.0,Individual,0,0.011658,36.0,51.25916,1845.329745,2.71372


In [34]:
# 結果を確認
print(df['loan_status'].value_counts())

loan_status
0    193815
1     48341
Name: count, dtype: int64


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 242156 entries, 0 to 242155
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   loan_amnt              242156 non-null  float64
 1   term                   242156 non-null  float64
 2   interest_rate          242156 non-null  float64
 3   grade                  242156 non-null  object 
 4   employment_length      213088 non-null  float64
 5   purpose                242156 non-null  object 
 6   credit_score           242156 non-null  float64
 7   application_type       242156 non-null  object 
 8   loan_status            242156 non-null  int64  
 9   monthly_interest_rate  242156 non-null  float64
 10  term_months            242156 non-null  float64
 11  monthly_payment        242156 non-null  float64
 12  total_payment          242156 non-null  float64
 13  burden_index           242156 non-null  float64
dtypes: float64(10), int64(1), object(3)


In [36]:
import os

# 保存先のディレクトリを指定
output_dir = "/content/drive/My Drive/signate"
os.makedirs(output_dir, exist_ok=True)  # ディレクトリが存在しない場合は作成

# 保存ファイル名を指定
output_file = os.path.join(output_dir, "train_with_features.csv")

# データセットを保存
df.to_csv(output_file, index=False)

print(f"データセットが保存されました: {output_file}")


データセットが保存されました: /content/drive/My Drive/signate/train_with_features.csv
