In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
def read_data(folder_path):
    files = [f for f in os.listdir(folder_path) if f.endswith('.parquet') or f.endswith('.csv')]
    dataframes = {}
    for file in files:
        file_path = os.path.join(folder_path, file)
        if file.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        dataframes[file] = df
    return dataframes

In [3]:
df = pd.read_parquet(r"C:\Users\RDRL\Desktop\AmEx 25\data\train_data.parquet")

In [4]:
test_df = pd.read_parquet(r"C:\Users\RDRL\Desktop\AmEx 25\data\test_data.parquet")

In [5]:
missing_value_count = df.isnull().sum().sort_values( ascending=False)


In [6]:
# features which have more than 75% missing values
features_to_drop = missing_value_count[missing_value_count > 0.75 * len(df)].index.tolist()
print("Features with more than 75% missing values:", features_to_drop)


Features with more than 75% missing values: ['f136', 'f135', 'f112', 'f122', 'f80', 'f360', 'f120', 'f34', 'f19', 'f13', 'f21', 'f14', 'f17', 'f20', 'f15', 'f16', 'f18', 'f84', 'f37', 'f189', 'f221', 'f205', 'f154', 'f176', 'f64', 'f88', 'f66', 'f70', 'f92', 'f220', 'f33', 'f79', 'f36', 'f118', 'f114', 'f81', 'f117', 'f4', 'f121', 'f3', 'f119', 'f116', 'f218']


In [7]:
def print_feature_details(features_to_drop, data_dictionary):
    for feature in features_to_drop:
        feature_info = data_dictionary[data_dictionary['masked_column'] == feature]
        if not feature_info.empty:
            for _, row in feature_info.iterrows():
                print(f"ID: {row['masked_column']}\nDescription: {row['Description']}\nType: {row['Type']}\nMissing Values: {missing_value_count[feature]}\n{'-'*60}")
        else:
            print(f"ID: {feature} - No information available in data dictionary.\n{'-'*60}")

In [8]:
# Rearrange the features in ascending order of missing values in a new DataFrame
sorted_columns = missing_value_count.sort_values(ascending=True).index.tolist()
df_sorted = df[sorted_columns]
df_sorted.sample(10)

Unnamed: 0,id1,id2,id3,id4,id5,y,f350,f349,f307,f306,...,f13,f19,f34,f120,f360,f80,f122,f112,f135,f136
341509,1622670_13411_16-23_2023-11-02 11:50:09.830,1622670,13411,2023-11-02 11:50:09.830,2023-11-02,0,42594,5,0.0,0.0,...,,,,,,,,,,
657684,1244050_61621_16-23_2023-11-01 10:37:37.945,1244050,61621,2023-11-01 10:37:37.945,2023-11-01,0,38173,4,0.0,0.0,...,,,,,,,,,,
697474,1255719_170747_16-23_2023-11-01 06:33:41.858,1255719,170747,2023-11-01 06:33:41.858,2023-11-01,0,23608,4,0.0,0.0,...,,,,,,,,,,
333630,1733455_26744_16-23_2023-11-02 17:08:02.102,1733455,26744,2023-11-02 17:08:02.102,2023-11-02,0,61618,5,0.0,0.0,...,,,,,,,,,,
663292,1847359_87102_16-23_2023-11-02 10:37:41.688,1847359,87102,2023-11-02 10:37:41.688,2023-11-02,0,38260,5,0.0,0.0,...,,,,,,,,,,
590710,1559996_72717_16-23_2023-11-03 09:57:56.065,1559996,72717,2023-11-03 09:57:56.065,2023-11-03,0,35800,6,0.0,0.0,...,,,,,,,,,,
677808,1595194_90178_16-23_2023-11-01 17:08:21.761,1595194,90178,2023-11-01 17:08:21.761,2023-11-01,0,61615,4,0.0,0.0,...,,,,,,,,,,
633192,1751822_606173_16-23_2023-11-02 03:48:57.798,1751822,606173,2023-11-02 03:48:57.798,2023-11-02,0,13656,5,0.0,0.0,...,,,,,,,,,,
325769,1004166_82883_16-23_2023-11-02 14:15:12.541,1004166,82883,2023-11-02 14:15:12.541,2023-11-02,0,51277,5,0.0,0.0,...,,,,,,,,,,
311409,1228043_34826_16-23_2023-11-01 10:55:47.000429,1228043,34826,2023-11-01 10:55:47.000429,2023-11-01,1,39322,4,0.0,0.0,...,,,,,,,,,,


In [9]:
def impute_missing_values(df, strategy='median'):
    df_imputed = df.copy()
    for column in df_imputed.columns:
        if df_imputed[column].isnull().any():
            if df_imputed[column].dtype == 'float64':
                if strategy == 'mean':
                    df_imputed[column].fillna(df_imputed[column].mean(), inplace=True)
                elif strategy == 'median':
                    df_imputed[column].fillna(df_imputed[column].median(), inplace=True)
    return df_imputed

In [10]:
# converting the column of df to respective dtypes as mentioned in the data dictionary
def convert_column_types(df, data_dictionary):
    type_map = {
        'Numerical': 'float64',
        'Categorical': 'category',
        'Key': 'category',
        'Label': 'category',
        'One hot encoded': 'category',
        '-': 'object',
    }
    for column in df.columns:
        if column in data_dictionary['masked_column'].values:
            dtype = data_dictionary.loc[data_dictionary['masked_column'] == column, 'Type'].iloc[0]
            if dtype in type_map:
                try:
                    df[column] = df[column].astype(type_map[dtype])
                except Exception as e:
                    print(f"Could not convert {column} to {type_map[dtype]}: {e}")
    return df

In [11]:
def print_object_columns(df):
    dtype_groups = {}
    for col in df.columns:
        dtype_str = str(df[col].dtype)
        dtype_groups.setdefault(dtype_str, []).append(col)
    for dtype, cols in dtype_groups.items():
        print(f"\nColumns with dtype '{dtype}' ({len(cols)} columns):")
        for col in cols:
            print(f"  - {col}")

In [12]:
data_dictionary = pd.read_csv(r"C:\Users\RDRL\Desktop\AmEx 25\details\data_dictionary.csv")

In [13]:
# rearranging in ascending order of missing values less than 25%
sorted_columns = missing_value_count[missing_value_count < 0.25 * len(df)].sort_values(ascending=True).index.tolist()
df_25 = df[sorted_columns]

In [14]:
df_25 = convert_column_types(df_25, data_dictionary)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(type_map[dtype])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(type_map[dtype])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(type_map[dtype])
A value is trying to be set on a copy of a slice from a DataF

Could not convert id4 to float64: could not convert string to float: '2023-11-02 22:22:00.042'
Could not convert id5 to float64: could not convert string to float: '2023-11-02'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(type_map[dtype])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(type_map[dtype])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column] = df[column].astype(type_map[dtype])
A value is trying to be set on a copy of a slice from a DataF

In [15]:
df_25.dtypes.value_counts()

float64     160
category     48
category     36
object        3
category      2
category      1
category      1
category      1
category      1
category      1
Name: count, dtype: int64

In [16]:
print_object_columns(df_25)


Columns with dtype 'category' (91 columns):
  - f349
  - id1
  - id3
  - y
  - f226
  - f227
  - f229
  - f230
  - f266
  - f265
  - f264
  - f263
  - f262
  - f261
  - f260
  - f259
  - f282
  - f281
  - f280
  - f279
  - f278
  - f277
  - f276
  - f275
  - f267
  - f268
  - f269
  - f270
  - f271
  - f304
  - f307
  - f306
  - f294
  - f293
  - f285
  - f297
  - f298
  - f299
  - f308
  - f309
  - f295
  - f296
  - f288
  - f287
  - f286
  - f292
  - f289
  - f290
  - f274
  - f273
  - f272
  - f291
  - f250
  - f305
  - f302
  - f303
  - f239
  - f240
  - f241
  - f242
  - f243
  - f244
  - f245
  - f246
  - f231
  - f232
  - f233
  - f234
  - f235
  - f228
  - f237
  - f238
  - f247
  - f248
  - f249
  - f284
  - f283
  - f258
  - f257
  - f256
  - f255
  - f254
  - f252
  - f236
  - f251
  - f300
  - f253
  - f301
  - f354
  - f52
  - f50

Columns with dtype 'float64' (160 columns):
  - f350
  - f335
  - f223
  - f225
  - f224
  - f333
  - f332
  - f334
  - f201
  - f203
  - f204

In [17]:
missing_cols = df_25.columns[df_25.isnull().any()]
dtype_counts = df_25[missing_cols].dtypes.value_counts()
print("Number of columns with missing values grouped by dtype in df_25:")
print(dtype_counts)

Number of columns with missing values grouped by dtype in df_25:
float64     159
category     48
category     36
category      2
category      1
Name: count, dtype: int64


In [18]:
df_25_imputed = impute_missing_values(df_25, strategy='median')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[column].fillna(df_imputed[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imputed[column].fillna(df_imputed[column].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

In [19]:
missing_cols_imputed = df_25_imputed.columns[df_25_imputed.isnull().any()]
dtype_counts_imputed = df_25_imputed[missing_cols_imputed].dtypes.value_counts()
print("Number of columns with missing values grouped by dtype in df_25:")
print(dtype_counts_imputed)

Number of columns with missing values grouped by dtype in df_25:
category    48
category    36
category     2
category     1
Name: count, dtype: int64


In [20]:

df_25_imputed.isnull().sum().sort_values(ascending=False)[:90]

f50     190663
f52     190663
f354    141991
f281        18
f226        18
         ...  
f302        18
f239        18
y            0
id3          0
id4          0
Length: 90, dtype: int64

In [21]:
def impute_one_hot_encoded_with_median(df, data_dictionary):
    skip_cols = {"f50", "f52", "f354"}
    one_hot_cols = data_dictionary[data_dictionary['Type'].str.lower().str.contains('one hot encoded')]['masked_column'].tolist()
    
    cols_to_impute = [
        col for col in one_hot_cols
        if col in df.columns and col not in skip_cols and df[col].isnull().any()
    ]
    
    df_imputed = df.copy()
    
    for col in cols_to_impute:
        # Coerce to numeric if needed
        df_imputed[col] = pd.to_numeric(df_imputed[col], errors='coerce')
        median_val = df_imputed[col].median()
        if pd.notna(median_val):
            df_imputed[col] = df_imputed[col].fillna(median_val)
        else:
            print(f"⚠️ Skipping column {col} — median is NaN (possibly all missing)")
    
    return df_imputed


In [22]:
df_imputed_all = impute_one_hot_encoded_with_median(df_25_imputed, data_dictionary)

In [23]:
df_imputed_all

Unnamed: 0,f349,f350,id1,id2,id3,id4,id5,y,f335,f223,...,f41,f47,f51,f44,f49,f52,f50,f46,f45,f313
0,5,80458.0,1366776_189706075_16-23_2023-11-02 22:22:00.042,1366776,189706075,2023-11-02 22:22:00.042,2023-11-02,0,0.0,1.0,...,602.15,7613.0,3166.0,0.0,0.0,,,6277.0,0.0,0.061555
1,4,85874.0,1366776_89227_16-23_2023-11-01 23:51:24.999,1366776,89227,2023-11-01 23:51:24.999,2023-11-01,0,0.0,-1.0,...,602.15,7613.0,3166.0,0.0,0.0,,,6277.0,0.0,0.061555
2,4,1855.0,1366776_35046_16-23_2023-11-01 00:30:59.797,1366776,35046,2023-11-01 00:30:59.797,2023-11-01,0,0.0,-2.0,...,602.15,7613.0,3166.0,0.0,0.0,,,6277.0,0.0,0.061555
3,5,80458.0,1366776_6275451_16-23_2023-11-02 22:21:32.261,1366776,6275451,2023-11-02 22:21:32.261,2023-11-02,0,0.0,1.0,...,602.15,7613.0,3166.0,0.0,0.0,,,6277.0,0.0,0.061555
4,5,80458.0,1366776_78053_16-23_2023-11-02 22:21:34.799,1366776,78053,2023-11-02 22:21:34.799,2023-11-02,0,0.0,1.0,...,602.15,7613.0,3166.0,0.0,0.0,,,6277.0,0.0,0.061555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
770159,5,29659.0,1896641_87731_16-23_2023-11-02 08:14:21.524,1896641,87731,2023-11-02 08:14:21.524,2023-11-02,0,0.0,29.0,...,68.02,43867.0,8518.0,0.0,0.0,N,N,38251.0,8.0,0.054501
770160,5,29659.0,1896641_505604_16-23_2023-11-02 08:14:24.458,1896641,505604,2023-11-02 08:14:24.458,2023-11-02,0,0.0,60.0,...,68.02,43867.0,8518.0,0.0,0.0,N,N,38251.0,8.0,0.055537
770161,5,29659.0,1896641_25212_16-23_2023-11-02 08:14:25.748,1896641,25212,2023-11-02 08:14:25.748,2023-11-02,0,0.0,60.0,...,68.02,43867.0,8518.0,0.0,0.0,N,N,38251.0,8.0,0.057477
770162,5,22103.0,1900765_95157_16-23_2023-11-02 06:08:25.900,1900765,95157,2023-11-02 06:08:25.900,2023-11-02,0,0.0,-1.0,...,102.50,7613.0,3166.0,0.0,0.0,,,6277.0,0.0,0.061555


In [24]:
missing_cols_imputed = df_imputed_all.columns[df_imputed_all.isnull().any()]
dtype_counts_imputed = df_imputed_all[missing_cols_imputed].dtypes.value_counts()
print("Number of columns with missing values grouped by dtype in df_25:")
print(dtype_counts_imputed)

Number of columns with missing values grouped by dtype in df_25:
category    2
category    1
Name: count, dtype: int64


In [25]:
df_imputed_all.isnull().sum().sort_values(ascending=False)[:90]

f50     190663
f52     190663
f354    141991
id1          0
f349         0
         ...  
f334         0
f284         0
f283         0
f258         0
f257         0
Length: 90, dtype: int64

In [26]:
import seaborn as sns

In [30]:
df_imputed_all = df_imputed_all.select_dtypes(include=['float64', 'int64'])

MemoryError: Unable to allocate 1.40 GiB for an array with shape (244, 770164) and data type float64

In [None]:
df_imputed_all.isnull().sum().sort_values(ascending=False)[:90]

In [28]:
# correlation plot cluster heatmap function
# def plot_correlation_heatmap(df, title='Correlation Heatmap'):
#     corr = df.corr()
#     plt.figure(figsize=(10, 8))
#     sns.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
#     plt.title(title)
#     plt.show()

# # correlation heatmap for the DataFrame with imputed values (df_imputed_all)
# if df_imputed_all.empty:
#     print("DataFrame is empty after imputation.")
# else:
#     print("Correlation Heatmap for DataFrame with Imputed Values:")
#     plot_correlation_heatmap(df_imputed_all, title='Correlation Heatmap of Imputed DataFrame')
    
# plot_correlation_heatmap(df_imputed_all, title='Correlation Heatmap of Imputed DataFrame')
