<a href="https://colab.research.google.com/github/atrabaja/MachineLearning_FinMark/blob/main/MS1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [136]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [137]:
import os
data_path = '/content/drive/MyDrive'

In [138]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [139]:
# Define file paths
customers_file_path = '/content/drive/My Drive/customers_data.csv'
products_file_path = '/content/drive/My Drive/products_data.csv'
transactions_file_path = '/content/drive/My Drive/transactions_data.csv'

In [140]:
# Load datasets
customers_data = pd.read_csv(customers_file_path)
products_data = pd.read_csv(products_file_path)
transactions_data = pd.read_csv(transactions_file_path)

# Display the first few rows of each dataset
print(customers_data.head())
print(products_data.head())
print(transactions_data.head())

   Company_ID          Company_Name  Company_Profit  \
0         1.0  Tech  Enterprises  1         80701.0   
1         2.0   Global  Partners  2         80511.0   
2         3.0  Quantum Associates 3        110664.0   
3         4.0       Prime Network 4             NaN   
4         5.0    Elite  Ventures  5         69427.0   

                                             Address  
0             EDSA, Barangay 606, Pasig, Philippines  
1  Commonwealth Ave, Barangay 789, Taguig, Philip...  
2       Roxas Blvd, Barangay 505, Pasig, Philippines  
3  Alabang-Zapote Rd, Barangay 202, Taguig, Phili...  
4    Ayala Avenue, Barangay 101, Makati, Philippines  
   Product_ID            Product_Name Product_Price
0         1.0      FinPredictor Suite      ?140,000
1         2.0  MarketMinder Analytics      ?168,000
2         3.0    TrendWise Forecaster      ?100,800
3         4.0  CustomerScope Insights      ?123,200
4         5.0     SalesSync Optimizer       ?84,000
   Unnamed: 0  Transaction_

In [141]:
# Preprocessing Customers Data
# Identify rows with missing Company_ID
missing_ids = customers_data[customers_data['Company_ID'].isnull()]

# Map existing Company_IDs to rows with missing IDs based on matching columns
customers_data['Company_ID'] = customers_data['Company_ID'].fillna(
    customers_data.merge(
        customers_data[['Company_ID', 'Company_Name', 'Address', 'Company_Profit']].dropna(subset=['Company_ID']),
        on=['Company_Name', 'Address', 'Company_Profit'],
        how='left'
    )['Company_ID_y']
)

In [142]:
# Find the maximum existing Company_ID
max_id = customers_data['Company_ID'].max()

# Assign unique IDs to remaining missing rows
customers_data.loc[customers_data['Company_ID'].isnull(), 'Company_ID'] = range(
    int(max_id) + 1, int(max_id) + 1 + customers_data['Company_ID'].isnull().sum()
)

# Convert Company_ID to integer type
customers_data['Company_ID'] = customers_data['Company_ID'].astype(int)

In [143]:
# Check for duplicates based on Company_Name, Address, and Company_Profit
duplicates = customers_data[customers_data.duplicated(subset=['Company_Name', 'Address', 'Company_Profit'], keep=False)]
print(duplicates)

# Drop duplicates if necessary
customers_data = customers_data.drop_duplicates(subset=['Company_Name', 'Address', 'Company_Profit'], keep='first')

Empty DataFrame
Columns: [Company_ID, Company_Name, Company_Profit, Address]
Index: []


In [144]:
# Verify there are no missing values or duplicates
print(customers_data.isnull().sum())
print(customers_data.duplicated().sum())

# Display dataset summary
print(customers_data.info())
print(customers_data.head())

Company_ID         0
Company_Name       0
Company_Profit    12
Address            0
dtype: int64
0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Company_ID      100 non-null    int64  
 1   Company_Name    100 non-null    object 
 2   Company_Profit  88 non-null     float64
 3   Address         100 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 3.3+ KB
None
   Company_ID          Company_Name  Company_Profit  \
0           1  Tech  Enterprises  1         80701.0   
1           2   Global  Partners  2         80511.0   
2           3  Quantum Associates 3        110664.0   
3           4       Prime Network 4             NaN   
4           5    Elite  Ventures  5         69427.0   

                                             Address  
0             EDSA, Barangay 606, Pasig, Philippines  
1  Commonwealth Ave,

In [145]:
# Remove numbers from the Company_Name column
customers_data['Company_Name'] = customers_data['Company_Name'].str.replace(r'\d+', '', regex=True).str.strip()

# Verify the changes
print(customers_data['Company_Name'].head())

0     Tech  Enterprises
1      Global  Partners
2    Quantum Associates
3         Prime Network
4       Elite  Ventures
Name: Company_Name, dtype: object


In [146]:
# Fill missing Company_Profit based on Address group median
customers_data['Company_Profit'] = customers_data.groupby('Address')['Company_Profit'].transform(
    lambda x: x.fillna(x.median())
)

In [147]:
# Verify no missing values remain
print(customers_data.isnull().sum())

# Display updated dataset
print(customers_data.head())

Company_ID         0
Company_Name       0
Company_Profit    12
Address            0
dtype: int64
   Company_ID        Company_Name  Company_Profit  \
0           1   Tech  Enterprises         80701.0   
1           2    Global  Partners         80511.0   
2           3  Quantum Associates        110664.0   
3           4       Prime Network             NaN   
4           5     Elite  Ventures         69427.0   

                                             Address  
0             EDSA, Barangay 606, Pasig, Philippines  
1  Commonwealth Ave, Barangay 789, Taguig, Philip...  
2       Roxas Blvd, Barangay 505, Pasig, Philippines  
3  Alabang-Zapote Rd, Barangay 202, Taguig, Phili...  
4    Ayala Avenue, Barangay 101, Makati, Philippines  


In [148]:
# Cleaned customer data
customers_data.to_csv('/content/drive/MyDrive/cleaned_customers_data.csv', index=False)

In [149]:
# Preprocessing Transactions Data
# Check basic info
print(transactions_data.info())
print(transactions_data.head())

# Check for missing values
print(transactions_data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        9000 non-null   float64
 1   Transaction_ID    9000 non-null   float64
 2   Company_ID        9000 non-null   float64
 3   Product_ID        9000 non-null   float64
 4   Quantity          9000 non-null   float64
 5   Transaction_Date  10000 non-null  object 
 6   Product_Price     9000 non-null   float64
 7   Total_Cost        9000 non-null   float64
dtypes: float64(7), object(1)
memory usage: 625.1+ KB
None
   Unnamed: 0  Transaction_ID  Company_ID  Product_ID  Quantity  \
0         0.0             1.0        88.0         6.0       NaN   
1         1.0             2.0        29.0        19.0      16.0   
2         2.0             NaN        28.0        18.0       6.0   
3         3.0             4.0        85.0        12.0      12.0   
4         4.0             5.0

In [150]:
# Handle Missing Values - Quality
transactions_data['Quantity'] = transactions_data['Quantity'].fillna(transactions_data['Quantity'].median())

In [154]:
# Handle Missing Values - Product Price
transactions_data['Product_Price'] = transactions_data['Product_Price'].fillna(transactions_data['Product_Price'].median())

In [158]:
# Display unique formats in Transaction_Date
print("Unique formats in Transaction_Date before conversion:\n", transactions_data['Transaction_Date'].unique())

# Define a function to parse different date formats
def parse_date(date_str):
    # Attempt to parse several known formats
    for fmt in ('%Y-%m-%d', '%d-%m-%Y', '%m-%d-%Y', '%Y/%m/%d', '%d/%m/%Y', '%m/%d/%Y', '%B %d, %Y', '%d %B %Y'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except (ValueError, TypeError):
            continue  # Try the next format if this one fails
    return pd.NaT  # Return NaT if no formats worked

# Apply the parsing function
transactions_data['Transaction_Date'] = transactions_data['Transaction_Date'].apply(parse_date)

# Check for NaT values after parsing
nat_count = transactions_data['Transaction_Date'].isna().sum()
print("NaT values in Transaction_Date column after conversion:", nat_count)

# Inspect problematic entries if there are any NaT values
if nat_count > 0:
    print("Remaining problematic date entries:\n", transactions_data[transactions_data['Transaction_Date'].isna()]['Transaction_Date'])

# Confirm the data type of Transaction_Date after all conversions
print("Data type of Transaction_Date after final conversion:", transactions_data['Transaction_Date'].dtypes)

Unique formats in Transaction_Date before conversion:
 <DatetimeArray>
['2024-03-26 00:00:00', '2024-07-09 00:00:00', '2024-04-13 00:00:00',
 '2023-06-09 00:00:00', '2021-06-07 00:00:00', '2021-07-12 00:00:00',
 '2023-11-01 00:00:00', '2023-10-31 00:00:00', '2024-02-25 00:00:00',
 '2021-03-29 00:00:00',
 ...
 '2022-11-30 00:00:00', '2024-03-16 00:00:00', '2020-06-11 00:00:00',
 '2020-01-12 00:00:00', '2022-07-13 00:00:00', '2024-11-08 00:00:00',
 '2024-12-04 00:00:00', '2023-11-04 00:00:00', '2020-03-12 00:00:00',
 '2024-12-09 00:00:00']
Length: 1490, dtype: datetime64[ns]
NaT values in Transaction_Date column after conversion: 0
Data type of Transaction_Date after final conversion: datetime64[ns]


In [159]:
# Final check of the dataset
print(transactions_data.info())
print(transactions_data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Unnamed: 0        9000 non-null   float64       
 1   Transaction_ID    9000 non-null   float64       
 2   Company_ID        9000 non-null   float64       
 3   Product_ID        9000 non-null   float64       
 4   Quantity          10000 non-null  float64       
 5   Transaction_Date  10000 non-null  datetime64[ns]
 6   Product_Price     10000 non-null  float64       
 7   Total_Cost        9000 non-null   float64       
dtypes: datetime64[ns](1), float64(7)
memory usage: 625.1 KB
None
   Unnamed: 0  Transaction_ID  Company_ID  Product_ID  Quantity  \
0         0.0             1.0        88.0         6.0      11.0   
1         1.0             2.0        29.0        19.0      16.0   
2         2.0             NaN        28.0        18.0       6.0   
3         3.0      

In [160]:
# Verify Missing Values
print(transactions_data.isnull().sum())

Unnamed: 0          1000
Transaction_ID      1000
Company_ID          1000
Product_ID          1000
Quantity               0
Transaction_Date       0
Product_Price          0
Total_Cost          1000
dtype: int64


In [161]:
# Dropped unnamed column
transactions_data = transactions_data.drop(columns=['Unnamed: 0'])

In [162]:
# Identify rows with missing Transaction_ID
missing_transaction_ids = transactions_data[transactions_data['Transaction_ID'].isnull()]
print(missing_transaction_ids)

      Transaction_ID  Company_ID  Product_ID  Quantity Transaction_Date  \
2                NaN        28.0        18.0       6.0       2024-04-13   
17               NaN        57.0        10.0       6.0       2022-07-10   
21               NaN        25.0        18.0       2.0       2021-03-21   
25               NaN        93.0         9.0      11.0       2021-09-21   
55               NaN         NaN        11.0      16.0       2023-03-08   
...              ...         ...         ...       ...              ...   
9981             NaN         NaN         5.0      11.0       2022-03-29   
9984             NaN        93.0        19.0      11.0       2021-03-25   
9995             NaN         NaN        10.0      11.0       2022-06-05   
9997             NaN        90.0         1.0      15.0       2022-07-19   
9999             NaN        77.0        16.0      14.0       2023-10-29   

      Product_Price  Total_Cost  
2     126095.547778    940800.0  
17    131297.783516    627200.0

In [163]:
# Assign unique identifiers
# Find the maximum existing Transaction_ID
max_transaction_id = transactions_data['Transaction_ID'].max()

# Assign unique IDs to missing Transaction_IDs
transactions_data.loc[transactions_data['Transaction_ID'].isnull(), 'Transaction_ID'] = range(
    int(max_transaction_id) + 1,
    int(max_transaction_id) + 1 + transactions_data['Transaction_ID'].isnull().sum()
)

# Convert Transaction_ID to integer type (if necessary)
transactions_data['Transaction_ID'] = transactions_data['Transaction_ID'].astype(int)

# Verify updates
print(transactions_data['Transaction_ID'].isnull().sum())

0


In [165]:
# Identify rows with missing Company ID
missing_company_id = transactions_data[transactions_data['Company_ID'].isnull()]
print("Rows with missing Company_ID:\n", missing_company_id)

Rows with missing Company_ID:
       Transaction_ID  Company_ID  Product_ID  Quantity Transaction_Date  \
6                  7         NaN         NaN      14.0       2023-11-01   
11                12         NaN         1.0      17.0       2022-04-05   
12                13         NaN        12.0      15.0       2023-02-27   
26                27         NaN        14.0      18.0       2023-03-01   
36                37         NaN         8.0      17.0       2023-03-16   
...              ...         ...         ...       ...              ...   
9962            9963         NaN        13.0       3.0       2023-03-14   
9965            9966         NaN         NaN       2.0       2022-11-13   
9972            9973         NaN         NaN      11.0       2021-04-20   
9981           10995         NaN         5.0      11.0       2022-03-29   
9995           10997         NaN        10.0      11.0       2022-06-05   

      Product_Price  Total_Cost  
6     133548.749710   1960000.0  
