## Initialize

In [None]:
import pandas as pd

In [None]:
# Prompt: read a file from Google Drive.
# Mount Google Drive into the Colab runtime so the cells below can read and
# write the parquet files they reference under /content/drive.

from google.colab import drive
drive.mount('/content/drive')

# After mounting, Drive files are addressed by path under the mount point;
# the data cells in this notebook use /content/drive/MyDrive/Amex/...



## Combined Dataset Creation

In [None]:
test_path = '/content/drive/MyDrive/Amex/test_data.parquet'
test_data = pd.read_parquet(test_path)
test_data.head()

In [None]:
train_path = '/content/drive/MyDrive/Amex/train_sample_20p.parquet' #'/content/drive/MyDrive/Amex/train_sample.parquet'
train_data = pd.read_parquet(train_path)
train_data.head()

In [None]:
train_data.shape

In [None]:
event_path = '/content/drive/MyDrive/Amex/add_event.parquet'
event_data = pd.read_parquet(event_path)
event_data.head()

In [None]:
# First, determine which DataFrame has which type and convert accordingly
# For example, if train_data has 'id2' as object and events has 'id2' as int32:
event_data['id2'] = event_data['id2'].astype(str)  # Convert int32 to string (object)

In [None]:
# Or alternatively, if you want to convert train_data's column to int:
# train_data['id2'] = train_data['id2'].astype(int)

# Then perform the merge
train_df = pd.merge(
    train_data,
    event_data,
    on=['id2', 'id3', 'id4'],  # Columns to join on
    how='left'  # Type of join: 'inner', 'outer', 'left', 'right'
)

In [None]:
# Then perform the merge
test_df = pd.merge(
    test_data,
    event_data,
    on=['id2', 'id3', 'id4'],  # Columns to join on
    how='left'  # Type of join: 'inner', 'outer', 'left', 'right'
)

In [None]:
print(train_df.shape)

In [None]:
print(test_df.shape)

In [None]:
test_df.to_parquet(r"/content/drive/MyDrive/Amex/test_event.parquet",index = True)

In [None]:
test_path = '/content/drive/MyDrive/Amex/test_event.parquet'
test_df = pd.read_parquet(test_path)


In [None]:
test_df.shape

In [None]:
trans_path = '/content/drive/MyDrive/Amex/add_trans.parquet'
trans_data = pd.read_parquet(trans_path)
trans_data.head()

In [None]:
print(trans_data['f367'].unique())

In [None]:
trans_data['id2'] = trans_data['id2'].astype(str)  # Convert int32 to string (object)

In [None]:
# Then perform the merge

train_df = pd.merge(
    train_df,
    trans_data,
    on=['id2'],  # Columns to join on
    how='left'  # Type of join: 'inner', 'outer', 'left', 'right'
)



In [None]:
# Then perform the merge
test_df = pd.merge(
    test_df,
    trans_data,
    on=['id2'],  # Columns to join on
    how='left'  # Type of join: 'inner', 'outer', 'left', 'right'
)

In [None]:
print(list(train_df.columns))

In [None]:
print(list(test_df.columns))

In [None]:
# Offer metadata table (offer_metadata.parquet); joined on id3 further below.
offer_path = '/content/drive/MyDrive/Amex/offer_metadata.parquet'
offer_data = pd.read_parquet(path=offer_path)
offer_data.head(5)

In [None]:
#unique f374 values

print(train_df['f374'].unique())

In [None]:
print(test_df['f374'].unique())

In [None]:
# drop f374 from train_df

train_df = train_df.drop('f374', axis=1)


In [None]:
# drop f374 from train_df

test_df = test_df.drop('f374', axis=1)


In [None]:
print(train_df['id8'].unique())

In [None]:
train_df = train_df.drop('id8', axis=1)


In [None]:
print(test_df['id8'].unique())

In [None]:
test_df = test_df.drop('id8', axis=1)

In [None]:
print(list(train_df.columns))

In [None]:
print(list(test_df.columns))

In [None]:
print(train_df.shape)

In [None]:
print(test_df.shape)

In [None]:
offer_data['id3'] = offer_data['id3'].astype(str)  # Convert int32 to string (object)

In [None]:
# Then perform the merge

train_df = pd.merge(
    train_df,
    offer_data,
    on=['id3'],  # Columns to join on
    how='left'  # Type of join: 'inner', 'outer', 'left', 'right'
)



In [None]:
# Then perform the merge
test_df = pd.merge(
    test_df,
    offer_data,
    on=['id3'],  # Columns to join on
    how='left'  # Type of join: 'inner', 'outer', 'left', 'right'
)

In [None]:
print(list(train_df.columns))

In [None]:
print(list(test_df.columns))

In [None]:
print(train_df.shape)

In [None]:
print(test_df.shape)

In [None]:
# replace the values in 'id4' column with the hour of the day of the timestamp, the datatype of the column should be an object

train_df['id4'] = pd.to_datetime(train_df['id4']).dt.hour.astype(str)

In [None]:
test_df['id4'] = pd.to_datetime(test_df['id4']).dt.hour.astype(str)

In [None]:
train_df.to_parquet(r"/content/drive/MyDrive/Amex/train_combined_sample_20p.parquet",index = True)#(r"/content/drive/MyDrive/Amex/train_combined_sample.parquet",index = True)


In [None]:
test_df.to_parquet(r"/content/drive/MyDrive/Amex/test_combined.parquet",index = True)

## Train Dataset

In [None]:
# read the train_data using dask
train_path = '/content/drive/MyDrive/Amex/train_data.parquet'
train_data = pd.read_parquet(train_path)
train_data.head()

In [None]:
print(list(train_data.dtypes))

# Trying K-Means Clustering

In [None]:
# Names of the feature columns that are converted to numeric dtype below.
# The list is generated from inclusive (first, last) runs of the f<N> naming
# scheme; numbers that fall between runs (f42, f48, f50, f52-f57, f226-f309,
# f349, f354) are not part of the list.
_NUMERIC_FEATURE_RUNS = [
    (1, 41),
    (43, 47),
    (49, 49),
    (51, 51),
    (58, 225),
    (310, 348),
    (350, 353),
    (355, 366),
]
numerical_cols = [
    f'f{number}'
    for first, last in _NUMERIC_FEATURE_RUNS
    for number in range(first, last + 1)
]


In [None]:
# Coerce the listed feature columns of train_data to numeric dtype. Values that
# cannot be parsed become NaN (errors='coerce'); listed names that are not
# columns of train_data are skipped.
present_numeric = [name for name in numerical_cols if name in train_data.columns]
for name in present_numeric:
    train_data[name] = pd.to_numeric(train_data[name], errors='coerce')


In [None]:
# Build the clustering matrix from the listed numeric features, with missing
# values replaced by 0. The conversion cell above skips listed names that are
# not columns of train_data, so the selection here does the same instead of
# raising KeyError on them; anything skipped is reported.
available_numeric_cols = [c for c in numerical_cols if c in train_data.columns]
missing_numeric_cols = [c for c in numerical_cols if c not in train_data.columns]
if missing_numeric_cols:
    print(f"Skipping {len(missing_numeric_cols)} listed columns not in train_data: {missing_numeric_cols}")
X = train_data[available_numeric_cols].fillna(0)

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Unfitted mini-batch k-means with 100 clusters and a fixed seed.
# NOTE: a later cell rebinds `kmeans` to a newly constructed MiniBatchKMeans
# before anything is fitted, so this instance is not the one used for the
# cluster assignment.
kmeans_settings = dict(
    n_clusters=100,
    batch_size=10000,
    max_iter=100,
    random_state=42,
)
kmeans = MiniBatchKMeans(**kmeans_settings)


In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:

# Standardise every feature of X (missing values were already replaced with 0
# when X was built).
full_scaler = StandardScaler()
X_scaled = full_scaler.fit_transform(X)

# Project onto the first 20 principal components before clustering.
pca = PCA(random_state=42, n_components=20)
X_reduced = pca.fit_transform(X_scaled)


In [None]:
X_small = X.sample(n=100_000, random_state=42)  # or stratified sample
X_scaled_small = StandardScaler().fit_transform(X_small.fillna(0))
X_reduced_small = PCA(n_components=20).fit_transform(X_scaled_small)



In [None]:
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(X_reduced_small)

# Assign clusters to full data (if needed)
X_full_scaled = StandardScaler().fit_transform(X.fillna(0))
X_full_reduced = PCA(n_components=20).fit_transform(X_full_scaled)


In [None]:
train_data['cluster'] = kmeans.predict(X_full_reduced)

In [None]:
train_data.head()

In [None]:
train_data.shape

# All Clicks + Sampled Non-Clicks

In [None]:
train_data['y'].dtype

In [None]:
# 1. Keep all clicks (positives)
clicks = train_data[train_data['y'] == '1']

# 2. Randomly sample a fraction of non-clicks (negatives)
non_clicks_sampled = train_data[train_data['y'] == '0'].sample(frac=0.25, random_state=42)

# 3. Combine and shuffle
train_sample = pd.concat([clicks, non_clicks_sampled]).sample(frac=1.0, random_state=42).reset_index(drop=True)


In [None]:
train_sample.shape

In [None]:
train_sample.to_parquet(r"/content/drive/MyDrive/Amex/train_sample_25p.parquet",index = True)

# Stratified Sampling

In [None]:
from sklearn.model_selection import train_test_split

# Stratified sample (e.g., 10% of train_data)
train_sample, _ = train_test_split(
    train_data,
    test_size=0.80,  # or use train_size=0.10
    stratify=train_data['y'],
    random_state=42
)

print("Sampled data shape:", train_sample.shape)
print(train_sample['y'].value_counts(normalize=True))


In [None]:
train_sample.to_parquet(r"/content/drive/MyDrive/Amex/train_sample_20p.parquet",index = True)