In [1]:
# Notebook dependencies: standard library first, then third-party packages
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings
# raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

# Fixed seed so NumPy's global random generator gives repeatable draws
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Plot appearance: stock matplotlib style, (12, 8) figure size, font size 10
plt.style.use('default')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

# Startup banner, one message per output line
print(
    "Basic libraries imported successfully!",
    "Available libraries: pandas, numpy, matplotlib",
    "Note: This version uses basic implementations of ML algorithms",
    sep="\n",
)


Basic libraries imported successfully!
Available libraries: pandas, numpy, matplotlib
Note: This version uses basic implementations of ML algorithms


In [2]:
# Read the pipe-delimited source file into a DataFrame
print("Loading dataset...")
df = pd.read_csv('../MachineLearningRating_v3.txt', delimiter='|', low_memory=False)

# Size summary: row/column counts and the deep memory footprint (bytes -> MB)
n_rows, n_cols = len(df), len(df.columns)
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {memory_mb:.2f} MB")
print("\nDataset Info:")
print(f"Total records: {n_rows:,}")
print(f"Total features: {n_cols}")

# Pull out the two target columns once so each statistic reads from a named Series
claims = df['TotalClaims']
premium = df['TotalPremium']
has_claim = claims > 0  # boolean mask: rows with a strictly positive claim amount

# TotalClaims: range, mean, and the count/share of rows with a positive claim
print("\nTarget variables analysis:")
print(f"TotalClaims - Range: {claims.min():.2f} to {claims.max():.2f}")
print(f"TotalClaims - Mean: {claims.mean():.2f}")
print(f"Records with claims > 0: {has_claim.sum():,} ({has_claim.mean() * 100:.2f}%)")

# TotalPremium: range and mean
print(f"\nTotalPremium - Range: {premium.min():.2f} to {premium.max():.2f}")
print(f"TotalPremium - Mean: {premium.mean():.2f}")


Loading dataset...
Dataset shape: (1000098, 52)
Memory usage: 2377.38 MB

Dataset Info:
Total records: 1,000,098
Total features: 52

Target variables analysis:
TotalClaims - Range: -12002.41 to 393092.11
TotalClaims - Mean: 64.86
Records with claims > 0: 2,788 (0.28%)

TotalPremium - Range: -782.58 to 65282.60
TotalPremium - Mean: 61.91


In [4]:
# Preview the dataset.
# An earlier cell already parses this same file into `df`; re-reading it here
# repeats a slow, memory-heavy load for no new information. Reuse the frame
# that is already in memory and only fall back to reading from disk when this
# cell is run on its own (i.e. `df` has not been defined yet).
if 'df' not in globals():
    print("Loading dataset...")
    df = pd.read_csv('../MachineLearningRating_v3.txt', delimiter='|', low_memory=False)

# Shape and deep memory footprint (bytes -> MB), followed by the first five rows
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print("\nFirst few rows:")
print(df.head())

Loading dataset...
Dataset shape: (1000098, 52)
Memory usage: 2377.38 MB

First few rows:
   UnderwrittenCoverID  PolicyID     TransactionMonth  IsVATRegistered  \
0               145249     12827  2015-03-01 00:00:00             True   
1               145249     12827  2015-05-01 00:00:00             True   
2               145249     12827  2015-07-01 00:00:00             True   
3               145255     12827  2015-05-01 00:00:00             True   
4               145255     12827  2015-07-01 00:00:00             True   

  Citizenship          LegalType Title Language                 Bank  \
0              Close Corporation    Mr  English  First National Bank   
1              Close Corporation    Mr  English  First National Bank   
2              Close Corporation    Mr  English  First National Bank   
3              Close Corporation    Mr  English  First National Bank   
4              Close Corporation    Mr  English  First National Bank   

       AccountType  ...         