In [1]:
from google.colab import drive

# Google Driveをマウント
drive.mount('/content/drive')

# マウント完了後のメッセージ
print("Google Driveがマウントされました！")


Mounted at /content/drive
Google Driveがマウントされました！


In [2]:
import pandas as pd

# CSVファイルを読み込む
file_path = '/content/drive/My Drive/signate/test.csv'
test_data = pd.read_csv(file_path, low_memory= False)

# 読み込んだデータを表示
test_data.head()


Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type
0,1496754,1912.5,3 years,10.16,B1,5 years,debt_consolidation,725.0,Individual
1,84909594,1800.0,3 years,8.99,B1,9 years,credit_card,695.0,Individual
2,1165403,550.0,3 years,14.65,C2,10 years,credit_card,660.0,Individual
3,91354446,2000.0,5 years,15.59,C5,10 years,credit_card,695.0,Individual
4,85636932,1500.0,5 years,12.79,C1,0 years,medical,720.0,Individual


In [3]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26906 entries, 0 to 26905
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 26906 non-null  int64  
 1   loan_amnt          26906 non-null  float64
 2   term               26906 non-null  object 
 3   interest_rate      26906 non-null  float64
 4   grade              26906 non-null  object 
 5   employment_length  25466 non-null  object 
 6   purpose            26906 non-null  object 
 7   credit_score       26906 non-null  float64
 8   application_type   26906 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 1.8+ MB


In [11]:
# 列のユニークな値
print("Unique values in 'application_type':", test_data['application_type'].unique())

# 列のユニークな値の総数
unique_count = test_data['application_type'].nunique(dropna=False)

print(f"Total unique values in 'application_type': {unique_count}")

Unique values in 'application_type': ['Individual' 'Joint App']
Total unique values in 'application_type': 2


# 前処理

In [14]:
import numpy as np

# 列全体を文字列型に変換して処理
test_data['term'] = test_data['term'].astype(str).str.replace(' years', '').str.strip()
test_data['employment_length'] = test_data['employment_length'].astype(str).str.replace(' years', '').str.replace('< 1', '0').str.strip()

# 数値型（float）に変換
test_data['term'] = test_data['term'].astype(float)
test_data['employment_length'] = pd.to_numeric(test_data['employment_length'], errors='coerce').astype(float)

# 月利と返済回数の準備
test_data['monthly_interest_rate'] = test_data['interest_rate'] / 100 / 12  # 月利に変換
test_data['term_months'] = test_data['term'] * 12  # 返済回数（月数）

# 毎月の返済額を計算する関数
def calculate_monthly_payment(loan_amnt, monthly_rate, term_months):
    if monthly_rate > 0:  # 月利が0でない場合
        return loan_amnt * (monthly_rate * (1 + monthly_rate)**term_months) / ((1 + monthly_rate)**term_months - 1)
    else:  # 月利が0の場合（単純に元金を均等割）
        return loan_amnt / term_months

# 毎月の返済額を計算
test_data['monthly_payment'] = test_data.apply(lambda row: calculate_monthly_payment(
    row['loan_amnt'], row['monthly_interest_rate'], row['term_months']
), axis=1)

# 総返済額（毎月の返済額 × 返済回数）
test_data['total_payment'] = test_data['monthly_payment'] * test_data['term_months']

# 負担指数の計算
test_data['burden_index'] = test_data['total_payment'] / test_data['credit_score']

# 結果を確認
test_data.head()


Unnamed: 0,id,loan_amnt,term,interest_rate,grade,employment_length,purpose,credit_score,application_type,monthly_interest_rate,term_months,monthly_payment,total_payment,burden_index
0,1496754,1912.5,3.0,10.16,B1,5.0,debt_consolidation,725.0,Individual,0.008467,36.0,61.854761,2226.771378,3.071409
1,84909594,1800.0,3.0,8.99,B1,9.0,credit_card,695.0,Individual,0.007492,36.0,57.231142,2060.321102,2.964491
2,1165403,550.0,3.0,14.65,C2,10.0,credit_card,660.0,Individual,0.012208,36.0,18.971798,682.984714,1.034825
3,91354446,2000.0,5.0,15.59,C5,10.0,credit_card,695.0,Individual,0.012992,60.0,48.201497,2892.089846,4.16128
4,85636932,1500.0,5.0,12.79,C1,0.0,medical,720.0,Individual,0.010658,60.0,33.968575,2038.114488,2.830715


In [15]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26906 entries, 0 to 26905
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     26906 non-null  int64  
 1   loan_amnt              26906 non-null  float64
 2   term                   26906 non-null  float64
 3   interest_rate          26906 non-null  float64
 4   grade                  26906 non-null  object 
 5   employment_length      23634 non-null  float64
 6   purpose                26906 non-null  object 
 7   credit_score           26906 non-null  float64
 8   application_type       26906 non-null  object 
 9   monthly_interest_rate  26906 non-null  float64
 10  term_months            26906 non-null  float64
 11  monthly_payment        26906 non-null  float64
 12  total_payment          26906 non-null  float64
 13  burden_index           26906 non-null  float64
dtypes: float64(10), int64(1), object(3)
memory usage: 2.9+

In [17]:
import os

# 保存先のディレクトリを指定
output_dir = "/content/drive/My Drive/signate"
os.makedirs(output_dir, exist_ok=True)  # ディレクトリが存在しない場合は作成

# 保存ファイル名を指定
output_file = os.path.join(output_dir, "test_with_features.csv")

# データセットを保存
test_data.to_csv(output_file, index=False)

print(f"データセットが保存されました: {output_file}")


データセットが保存されました: /content/drive/My Drive/signate/test_with_features.csv
