#### Step 1: Import the required libraries


In [253]:
# Step 1.1: Linear algebra
import numpy as np

# Step 1.2: Data processing
import pandas as pd

# Step 1.3: For dimensionality reduction
from sklearn.decomposition import PCA

#### Step 2: Read the data from train.csv

In [254]:
# Assuming your file is named 'train.csv'
file_path = './train.csv'
data = pd.read_csv(file_path)

# Step 2.1: Let us understand the data
# Display basic information about the dataset
print("Dataset Info:\n")
print(data.info())

# Display summary statistics
print("\nSummary Statistics:\n", data.describe())

# Display the unique values in each column
print("\nUnique Values in Each Column:\n", data.nunique())

# Step 2.2: Print a few rows and see how the data looks like
print("\nFirst 5 Rows of the Data:\n", data.head())

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB
None

Summary Statistics:
                 ID            y          X10     X11          X12  \
count  4209.000000  4209.000000  4209.000000  4209.0  4209.000000   
mean   4205.960798   100.669318     0.013305     0.0     0.075077   
std    2437.608688    12.679381     0.114590     0.0     0.263547   
min       0.000000    72.110000     0.000000     0.0     0.000000   
25%    2095.000000    90.820000     0.000000     0.0     0.000000   
50%    4220.000000    99.150000     0.000000     0.0     0.000000   
75%    6314.000000   109.010000     0.000000     0.0     0.000000   
max    8417.000000   265.320000     1.000000     0.0     1.000000   

               X13          X14          X15          X16          X17  ...  \
count  4209.000000  4209.000000  4209.000000  4209.000000  4209.000000  ...   
mean  

#### Step 3: Collect the Y values into an array

In [255]:
# Step 3.1: Separate the y from the data

# Assuming your target variable (Y) is named 'target_column'
target_column = 'y'  # Replace with your actual target column name

# Separate the target variable from the rest of the data
y = data[target_column]

# Drop the target column from the original data
X = data.drop(columns=[target_column, 'ID'])

# Display the shape of X and y
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (4209, 376)
Shape of y: (4209,)


#### Step 4: Understand the data types we have

In [256]:
# Step 4.1: Iterate through all the columns which has X in the name of the column

# Filter columns with 'X' in the name
x_columns = [col for col in X.columns if 'X' in col]

# Iterate through the selected columns and print their data types
print("Data Types of Columns with 'X':")
for col in x_columns:
    print(f"{col}: {X[col].dtype}")

Data Types of Columns with 'X':
X0: object
X1: object
X2: object
X3: object
X4: object
X5: object
X6: object
X8: object
X10: int64
X11: int64
X12: int64
X13: int64
X14: int64
X15: int64
X16: int64
X17: int64
X18: int64
X19: int64
X20: int64
X21: int64
X22: int64
X23: int64
X24: int64
X26: int64
X27: int64
X28: int64
X29: int64
X30: int64
X31: int64
X32: int64
X33: int64
X34: int64
X35: int64
X36: int64
X37: int64
X38: int64
X39: int64
X40: int64
X41: int64
X42: int64
X43: int64
X44: int64
X45: int64
X46: int64
X47: int64
X48: int64
X49: int64
X50: int64
X51: int64
X52: int64
X53: int64
X54: int64
X55: int64
X56: int64
X57: int64
X58: int64
X59: int64
X60: int64
X61: int64
X62: int64
X63: int64
X64: int64
X65: int64
X66: int64
X67: int64
X68: int64
X69: int64
X70: int64
X71: int64
X73: int64
X74: int64
X75: int64
X76: int64
X77: int64
X78: int64
X79: int64
X80: int64
X81: int64
X82: int64
X83: int64
X84: int64
X85: int64
X86: int64
X87: int64
X88: int64
X89: int64
X90: int64
X91: int64


#### Step5: Count the data in each of the columns

In [257]:
# Display the count of non-null values in each column
column_counts = X.count()
print("Count of Data in Each Column:")
print(column_counts)

Count of Data in Each Column:
X0      4209
X1      4209
X2      4209
X3      4209
X4      4209
        ... 
X380    4209
X382    4209
X383    4209
X384    4209
X385    4209
Length: 376, dtype: int64


#### Step 6: Read the test.csv data

In [258]:
# Assuming your file is named 'test.csv'
test_file_path = './test.csv'
test_data = pd.read_csv(test_file_path)

# Step 6.1: Remove columns ID and Y from the data
# Assuming 'ID' and 'Y' are the columns to be removed
columns_to_remove = ['ID', 'Y']
test_data_cleaned = test_data.drop(columns=columns_to_remove, errors='ignore')

# Display the first few rows of the cleaned test data
print("First 5 Rows of Cleaned Test Data:\n", test_data_cleaned.head())

First 5 Rows of Cleaned Test Data:
    X0 X1  X2 X3 X4 X5 X6 X8  X10  X11  ...  X375  X376  X377  X378  X379  \
0  az  v   n  f  d  t  a  w    0    0  ...     0     0     0     1     0   
1   t  b  ai  a  d  b  g  y    0    0  ...     0     0     1     0     0   
2  az  v  as  f  d  a  j  j    0    0  ...     0     0     0     1     0   
3  az  l   n  f  d  z  l  n    0    0  ...     0     0     0     1     0   
4   w  s  as  c  d  y  i  m    0    0  ...     1     0     0     0     0   

   X380  X382  X383  X384  X385  
0     0     0     0     0     0  
1     0     0     0     0     0  
2     0     0     0     0     0  
3     0     0     0     0     0  
4     0     0     0     0     0  

[5 rows x 376 columns]


#### Step 7: Check for null and unique values for test and train sets

In [259]:
# Check for null values in the training set
print("Null Values in Train Set:\n", X.isnull().sum())

# Check for unique values in the training set
print("\nUnique Values in Train Set:\n", X.nunique())

# Check for null values in the test set
print("\nNull Values in Test Set:\n", test_data_cleaned.isnull().sum())

# Check for unique values in the test set
print("\nUnique Values in Test Set:\n", test_data_cleaned.nunique())

Null Values in Train Set:
 X0      0
X1      0
X2      0
X3      0
X4      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 376, dtype: int64



Unique Values in Train Set:
 X0      47
X1      27
X2      44
X3       7
X4       4
        ..
X380     2
X382     2
X383     2
X384     2
X385     2
Length: 376, dtype: int64

Null Values in Test Set:
 X0      0
X1      0
X2      0
X3      0
X4      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 376, dtype: int64

Unique Values in Test Set:
 X0      49
X1      27
X2      45
X3       7
X4       4
        ..
X380     2
X382     2
X383     2
X384     2
X385     2
Length: 376, dtype: int64


#### Step 8: If for any column(s), the variance is equal to zero, then remove those variable(s)

In [260]:
# Step 8: Identify and remove columns with zero variance in the training set
numeric_columns_train = X.select_dtypes(include=np.number).columns
zero_variance_columns_train = X[numeric_columns_train].columns[X[numeric_columns_train].var() == 0]
X = X.drop(columns=zero_variance_columns_train)
print("Columns with Zero Variance Removed in Training Set:\n", zero_variance_columns_train)

# Identify and remove the same columns with zero variance in the test set
test_data_cleaned = test_data_cleaned.drop(columns=zero_variance_columns_train)
print("Columns with Zero Variance Removed in Test Set:\n", zero_variance_columns_train)

Columns with Zero Variance Removed in Training Set:
 Index(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293',
       'X297', 'X330', 'X347'],
      dtype='object')
Columns with Zero Variance Removed in Test Set:
 Index(['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293',
       'X297', 'X330', 'X347'],
      dtype='object')


In [261]:
# Assuming you want to apply label encoding to categorical columns
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns for label encoding in the training set
categorical_columns = X.select_dtypes(include=['object']).columns
label_encoder = LabelEncoder()

# Concatenate the training and test sets for consistent label encoding
combined_data = pd.concat([X, test_data_cleaned], axis=0)

# Reset index of the combined_data DataFrame
combined_data.reset_index(drop=True, inplace=True)

# Apply label encoding to categorical columns in the combined dataset
for col in categorical_columns:
    combined_data[col] = label_encoder.fit_transform(combined_data[col])

# Apply label encoding to categorical columns in the training set
for col in categorical_columns:
    X[col] = combined_data.loc[X.index, col]

# Apply label encoding to categorical columns in the test set
for col in categorical_columns:
    test_data_cleaned[col] = combined_data.loc[test_data_cleaned.index, col]

# Display the first few rows of the updated train and test sets
print("\nUpdated Train Set (after label encoding):\n", X.head())
print("\nUpdated Test Set (after label encoding):\n", test_data_cleaned.head())



Updated Train Set (after label encoding):
    X0  X1  X2  X3  X4  X5  X6  X8  X10  X12  ...  X375  X376  X377  X378  \
0  37  23  20   0   3  27   9  14    0    0  ...     0     0     1     0   
1  37  21  22   4   3  31  11  14    0    0  ...     1     0     0     0   
2  24  24  38   2   3  30   9  23    0    0  ...     0     0     0     0   
3  24  21  38   5   3  30  11   4    0    0  ...     0     0     0     0   
4  24  23  38   5   3  14   3  13    0    0  ...     0     0     0     0   

   X379  X380  X382  X383  X384  X385  
0     0     0     0     0     0     0  
1     0     0     0     0     0     0  
2     0     0     1     0     0     0  
3     0     0     0     0     0     0  
4     0     0     0     0     0     0  

[5 rows x 364 columns]

Updated Test Set (after label encoding):
    X0  X1  X2  X3  X4  X5  X6  X8  X10  X12  ...  X375  X376  X377  X378  \
0  37  23  20   0   3  27   9  14    0    0  ...     0     0     0     1   
1  37  21  22   4   3  31  11  14    0  

#### Step 9: Make sure the data is now changed into numericals

In [262]:
# Display the data types of each column
print("Data Types after Label Encoding:\n", test_data_cleaned.dtypes)

# Check for any remaining non-numeric columns
non_numeric_columns_remaining = test_data_cleaned.select_dtypes(exclude=np.number).columns
print("\nNon-Numeric Columns Remaining:\n", non_numeric_columns_remaining)

Data Types after Label Encoding:
 X0      int32
X1      int32
X2      int32
X3      int32
X4      int32
        ...  
X380    int64
X382    int64
X383    int64
X384    int64
X385    int64
Length: 364, dtype: object

Non-Numeric Columns Remaining:
 Index([], dtype='object')


#### Step 10: Perform dimensionality reduction

In [263]:
# Step 10.1: Linear dimensionality reduction using Singular Value Decomposition (SVD)

from sklearn.decomposition import TruncatedSVD

# Specify the number of components for dimensionality reduction
n_components_svd = 10  # Adjust as needed

# Step 10.1: Apply SVD to training set
svd = TruncatedSVD(n_components=n_components_svd)
X_svd = svd.fit_transform(X)

# Check if the number of features in the test set matches the training set
if X.shape[1] == test_data_cleaned.shape[1]:
    # Apply SVD to test set
    test_data_svd = svd.transform(test_data_cleaned)

    # Display the explained variance ratio
    print("Explained Variance Ratio after SVD:\n", svd.explained_variance_ratio_)

    # Display the shape of the reduced train and test sets
    print("\nShape of Reduced Train Set after SVD:", X_svd.shape)
    print("Shape of Reduced Test Set after SVD:", test_data_svd.shape)
else:
    print("Number of features in the training set and test set do not match.")

Explained Variance Ratio after SVD:
 [0.24565996 0.32999136 0.16391651 0.11337014 0.08648512 0.01796575
 0.00705483 0.00468904 0.00339067 0.00248953]

Shape of Reduced Train Set after SVD: (4209, 10)
Shape of Reduced Test Set after SVD: (4209, 10)


#### Step11: Training using xgboost 



In [264]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Step 11: Training using xgboost

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the datasets into DMatrix format, which is the internal data structure that XGBoost uses
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

# Define the XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # Regression task with squared error
    'eval_metric': 'rmse',  # Root Mean Squared Error as the evaluation metric
    'max_depth': 6,  # Maximum depth of a tree
    'learning_rate': 0.1,  # Step size shrinkage used in each boosting step
    'subsample': 0.8,  # Fraction of samples used for training trees
    'colsample_bytree': 0.8,  # Fraction of features used for training trees
    'n_estimators': 100  # Number of boosting rounds
}

# Train the XGBoost model
model = xgb.train(params, dtrain, num_boost_round=1000, evals=[(dvalid, 'validation')], early_stopping_rounds=50, verbose_eval=10)

# Make predictions on the validation set
y_pred = model.predict(dvalid)

# Calculate and print the Root Mean Squared Error (RMSE) on the validation set
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
print(f"Validation RMSE: {rmse}")


[0]	validation-rmse:11.75450
[10]	validation-rmse:8.52281
[20]	validation-rmse:8.13722


Parameters: { "n_estimators" } are not used.



[30]	validation-rmse:8.17701
[40]	validation-rmse:8.28537
[50]	validation-rmse:8.29410
[60]	validation-rmse:8.34715
[70]	validation-rmse:8.38759
[71]	validation-rmse:8.39184
Validation RMSE: 8.398574652037023


#### Step 12: Predict your test_df values using xgboost


In [265]:
# Step 12: Predict your test_df values using xgboost

# Convert the test dataset into DMatrix format
dtest = xgb.DMatrix(test_data_cleaned)

# Make predictions on the test set
test_predictions = model.predict(dtest)

# Display the predicted values for the test set
print("Predicted Values for Test Set:\n", test_predictions)


Predicted Values for Test Set:
 [ 78.356125  94.23115   78.453896 ...  91.43997  116.34771   92.12124 ]
