In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv('D:/week6 data/data.csv')

# Aggregate features
df_agg = df.groupby('CustomerId').agg(
    total_transaction_amount=('Amount', 'sum'),
    avg_transaction_amount=('Amount', 'mean'),
    transaction_count=('Amount', 'count'),
    std_transaction_amount=('Amount', 'std')
).reset_index()

print(f"Aggregate Features:\n{df_agg.head()}")

Aggregate Features:
        CustomerId  total_transaction_amount  avg_transaction_amount  \
0     CustomerId_1                  -10000.0           -10000.000000   
1    CustomerId_10                  -10000.0           -10000.000000   
2  CustomerId_1001                   20000.0             4000.000000   
3  CustomerId_1002                    4225.0              384.090909   
4  CustomerId_1003                   20000.0             3333.333333   

   transaction_count  std_transaction_amount  
0                  1                     NaN  
1                  1                     NaN  
2                  5             6558.963333  
3                 11              560.498966  
4                  6             6030.478146  


In [3]:
# Convert TransactionStartTime to datetime
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

# Extract features
df['transaction_hour'] = df['TransactionStartTime'].dt.hour
df['transaction_day'] = df['TransactionStartTime'].dt.day
df['transaction_month'] = df['TransactionStartTime'].dt.month
df['transaction_year'] = df['TransactionStartTime'].dt.year

print(f"Extracted Features:\n{df[['TransactionStartTime', 'transaction_hour', 'transaction_day', 'transaction_month', 'transaction_year']].head()}")


Extracted Features:
       TransactionStartTime  transaction_hour  transaction_day  \
0 2018-11-15 02:18:49+00:00                 2               15   
1 2018-11-15 02:19:08+00:00                 2               15   
2 2018-11-15 02:44:21+00:00                 2               15   
3 2018-11-15 03:32:55+00:00                 3               15   
4 2018-11-15 03:34:21+00:00                 3               15   

   transaction_month  transaction_year  
0                 11              2018  
1                 11              2018  
2                 11              2018  
3                 11              2018  
4                 11              2018  


In [4]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-Hot Encoding for Categorical Variables
df_encoded = pd.get_dummies(df, columns=['CurrencyCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId'])

# Label Encoding for Categorical Variables
le = LabelEncoder()
df['TransactionId'] = le.fit_transform(df['TransactionId'])
df['BatchId'] = le.fit_transform(df['BatchId'])
df['AccountId'] = le.fit_transform(df['AccountId'])
df['SubscriptionId'] = le.fit_transform(df['SubscriptionId'])
df['CustomerId'] = le.fit_transform(df['CustomerId'])

print(f"Encoded Features:\n{df.head()}")


Encoded Features:
   TransactionId  BatchId  AccountId  SubscriptionId  CustomerId CurrencyCode  \
0          78150    46980       2490            3535        2584          UGX   
1          75821    31755       3219            2366        2584          UGX   
2          39888    60272       2713             996        2806          UGX   
3          48738     1797       3351             974        3733          UGX   
4          41364    48941       3219            2366        3733          UGX   

   CountryCode    ProviderId     ProductId     ProductCategory    ChannelId  \
0          256  ProviderId_6  ProductId_10             airtime  ChannelId_3   
1          256  ProviderId_4   ProductId_6  financial_services  ChannelId_2   
2          256  ProviderId_6   ProductId_1             airtime  ChannelId_3   
3          256  ProviderId_1  ProductId_21        utility_bill  ChannelId_3   
4          256  ProviderId_4   ProductId_6  financial_services  ChannelId_2   

    Amount  Value   

In [7]:
import pandas as pd

# Load dataset
df = pd.read_csv('D:/week6 data/data.csv')

# Check for missing values
missing_values = df.isnull().sum()
print(f"Missing Values:\n{missing_values}")

# Imputation example using the recommended method
df.fillna({'Amount': df['Amount'].mean()}, inplace=True)

# Alternatively, assign the result back to the DataFrame
# df['Amount'] = df['Amount'].fillna(df['Amount'].mean())

# Check again for missing values to ensure they are filled
missing_values_after = df.isnull().sum()
print(f"Missing Values After Imputation:\n{missing_values_after}")

Missing Values:
TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
dtype: int64
Missing Values After Imputation:
TransactionId           0
BatchId                 0
AccountId               0
SubscriptionId          0
CustomerId              0
CurrencyCode            0
CountryCode             0
ProviderId              0
ProductId               0
ProductCategory         0
ChannelId               0
Amount                  0
Value                   0
TransactionStartTime    0
PricingStrategy         0
FraudResult             0
dtype: int64


In [6]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Normalization
scaler = MinMaxScaler()
df[['Amount', 'Value']] = scaler.fit_transform(df[['Amount', 'Value']])

# Standardization
scaler = StandardScaler()
df[['Amount', 'Value']] = scaler.fit_transform(df[['Amount', 'Value']])

print(f"Scaled Features:\n{df[['Amount', 'Value']].head()}")


Scaled Features:
     Amount     Value
0 -0.046371 -0.072291
1 -0.054643 -0.080251
2 -0.050426 -0.076352
3  0.107717  0.096648
4 -0.059704 -0.075183
