In [101]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score, GroupKFold
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.cluster import FeatureAgglomeration

from sklearn.linear_model import LogisticRegression

from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier, plot_tree, export_graphviz

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report
# , plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve
from sklearn.compose import ColumnTransformer, make_column_transformer, make_column_selector

### Feature Selection

In [102]:
df = pd.read_csv("..\Data\csgo_round_snapshots_cleaned.csv")
df.head()

Unnamed: 0,time_left,ct_score,t_score,map,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,...,ct_grenade_molotovgrenade,t_grenade_molotovgrenade,ct_grenade_decoygrenade,t_grenade_decoygrenade,round_winner,t_weapon_count,ct_weapon_count,t_grenade_count,ct_grenade_count,round_winner_encoded
0,175.0,0.0,0.0,de_dust2,False,500.0,500.0,0.0,0.0,4000.0,...,0.0,0.0,0.0,0.0,CT,5.0,5.0,4510.0,4510.0,1
1,156.03,0.0,0.0,de_dust2,False,500.0,500.0,400.0,300.0,600.0,...,0.0,0.0,0.0,0.0,CT,5.0,5.0,1462.0,1511.0,1
2,96.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,...,0.0,0.0,0.0,0.0,CT,4.0,4.0,1110.0,1444.0,1
3,76.03,0.0,0.0,de_dust2,False,391.0,400.0,294.0,200.0,750.0,...,0.0,0.0,0.0,0.0,CT,4.0,4.0,1108.0,1444.0,1
4,174.97,1.0,0.0,de_dust2,False,500.0,500.0,192.0,0.0,18350.0,...,0.0,0.0,0.0,0.0,CT,5.0,5.0,11260.0,19054.0,1


In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117448 entries, 0 to 117447
Columns: 102 entries, time_left to round_winner_encoded
dtypes: bool(1), float64(98), int64(1), object(2)
memory usage: 90.6+ MB


#### Optimize data types

In [114]:

import logging

# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

In [104]:
def convert_numeric_to_efficient_dtype(data):
    """
    Convert numeric data (integers) to the most efficient data type.
    
    Parameters:
        data (int or pd.Series): Input integer value or pandas Series containing integers.
    
    Returns:
        pd.Series or int: A pandas Series or single integer with the converted data type.
    """
    if isinstance(data, pd.Series):
        try:
            data.astype(np.int8, copy=False)
            return data.astype(np.int8, copy=False)
        except (ValueError, OverflowError) as e:
            logging.warning(f"Failed to convert to int8: {e}")
            try:
                data.astype(np.int16, copy=False)
                return data.astype(np.int16, copy=False)
            except (ValueError, OverflowError) as e:
                logging.warning(f"Failed to convert to int16: {e}")
                try:
                    data.astype(np.int32, copy=False)
                    return data.astype(np.int32, copy=False)
                except (ValueError, OverflowError) as e:
                    logging.warning(f"Failed to convert to int32: {e}")
                    return data.astype(np.int64, copy=False)

    elif isinstance(data, int):
        series_data = pd.Series([data])
        return convert_numeric_to_efficient_dtype(series_data).iloc[0]

    else:
        error_message = "Input must be an integer or a pandas Series of integers."
        logging.error(error_message)
        raise TypeError(error_message)


In [105]:
def convert_float_to_efficient_dtype(data):
    """
    Convert float data to the most efficient data type.
    
    Parameters:
        data (float or pd.Series): Input float value or pandas Series containing float values.
    
    Returns:
        pd.Series or float: A pandas Series or single float with the converted data type.
    """
    if isinstance(data, pd.Series):

        try:
            return data.astype(np.float16, copy=False)
        except (ValueError, OverflowError) as e:
            logging.warning(f"Failed to convert to float16: {e}")
            try:
                return data.astype(np.float32, copy=False)
            except (ValueError, OverflowError) as e:
                logging.warning(f"Failed to convert to float32: {e}")
                return data.astype(np.float64, copy=False)

    elif isinstance(data, float):
        series_data = pd.Series([data])
        return convert_float_to_efficient_dtype(series_data).iloc[0]

    else:
        error_message = "Input must be a float or a pandas Series of floats."
        logging.error(error_message)
        raise TypeError(error_message)


In [106]:
def convert_object_to_efficient_dtype(data):
    """
    Convert object data (strings) to the most efficient category type.
    
    Parameters:
        data (str or pd.Series): Input string value or pandas Series containing strings.
    
    Returns:
        pd.Series or str: A pandas Series or single string with the converted category type.
    """
    if isinstance(data, pd.Series):
        try:
            # Attempt to convert to categorical type
            return data.astype('category', copy=False)
        except Exception as e:
            logging.error(f"Failed to convert Series to categorical: {e}")
            raise

    elif isinstance(data, str):
        series_data = pd.Series([data])
        return convert_object_to_efficient_dtype(series_data).iloc[0]

    else:
        error_message = "Input must be a string or a pandas Series of strings."
        logging.error(error_message)
        raise TypeError(error_message)


In [107]:
def optimize_data_types(df, numeric_cols, boolean_cols, categorical_cols):
    """Optimize data types for efficient memory usage."""
    try:
        for col in numeric_cols:
            if df[col].dtype.kind in 'i':  # integer types
                df[col] = convert_numeric_to_efficient_dtype(df[col])
            elif df[col].dtype.kind in 'f':  # float types
                df[col] = convert_float_to_efficient_dtype(df[col])

        for col in boolean_cols:
            df[col] = df[col].astype('bool')  # Use bool for boolean columns

        for col in categorical_cols:
            df[col] = convert_object_to_efficient_dtype(df[col])  # Use category for categorical columns
            
    except Exception as e:
        logging.error(f"Error optimizing data types: {e}")


In [108]:
df['round_winner'].dtype

dtype('O')

In [119]:
if df is not None:
    try:
        # Define column types
                 
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        boolean_columns = df.select_dtypes(include=[np.bool_]).columns.tolist()
        categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
        
        # Remove target column from categorical columns
        if 'round_winner' in categorical_columns:
            categorical_columns.remove('round_winner')  
        target_column = df['round_winner']  
        # Optimize data types
        optimize_data_types(df, numeric_columns, boolean_columns, categorical_columns)
        
    except Exception as e:
        logging.error(f"Error processing DataFrame: {e}")
else:
    logging.warning("The DataFrame is None.")


In [120]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117448 entries, 0 to 117447
Columns: 102 entries, time_left to round_winner_encoded
dtypes: bool(1), category(1), float16(98), int8(1), object(1)
memory usage: 23.2+ MB


In [111]:
# memory requirement decreased from 90.6+ MB to 23.2+ MB

In [112]:
df.dtypes.value_counts()

float16     98
category     1
bool         1
object       1
int8         1
Name: count, dtype: int64

In [113]:
df.dtypes.nunique()

5

### converting to numerical 

In [118]:
categorical_columns

['map']

In [117]:
target_column

0         CT
1         CT
2         CT
3         CT
4         CT
          ..
117443     T
117444     T
117445     T
117446     T
117447     T
Name: round_winner, Length: 117448, dtype: object

In [123]:
def label_encode_column(df, column_name):
    """
    Label encode a specified column in the DataFrame.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to encode.
        column_name (str): The name of the column to be label encoded.

    Returns:
        pd.Series: The label encoded column as a pandas Series.
    """
    try:
        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
        
        # Initialize the LabelEncoder
        le = LabelEncoder()
        
        # Fit and transform the specified column
        df[column_name] = le.fit_transform(df[column_name])
        
        logging.info(f"Successfully label encoded column '{column_name}'.")
        return df[column_name]
    
    except ValueError as ve:
        logging.error(f"ValueError: {ve}")
        raise  # Re-raise the exception after logging
    except Exception as e:
        logging.error(f"Error encoding column '{column_name}': {e}")
        raise  # Re-raise the exception after logging


  



In [124]:
def one_hot_encode_column(df, column_name):
    """
    One-hot encode a specified column in the DataFrame.

    Parameters:
        df (pd.DataFrame): The DataFrame containing the column to encode.
        column_name (str): The name of the column to be one-hot encoded.

    Returns:
        pd.DataFrame: The DataFrame with the one-hot encoded columns added.
    """
    try:
        if column_name not in df.columns:
            raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
        
        # Perform one-hot encoding
        one_hot_encoded = pd.get_dummies(df[column_name], prefix=column_name, drop_first=True)
        
        # Concatenate the original DataFrame with the new one-hot encoded columns
        df = pd.concat([df, one_hot_encoded], axis=1)
        
        # Drop the original categorical column
        df.drop(column_name, axis=1, inplace=True)
        
        logging.info(f"Successfully one-hot encoded column '{column_name}'.")
        return df
    
    except ValueError as ve:
        logging.error(f"ValueError: {ve}")
        raise  # Re-raise the exception after logging
    except Exception as e:
        logging.error(f"Error encoding column '{column_name}': {e}")
        raise  # Re-raise the exception after logging



In [127]:
one_hot_encode_column(df, 'map')


2025-01-21 11:24:43,529 - INFO - Successfully one-hot encoded column 'map'.


Unnamed: 0,time_left,ct_score,t_score,bomb_planted,ct_health,t_health,ct_armor,t_armor,ct_money,t_money,...,t_grenade_count,ct_grenade_count,round_winner_encoded,map_de_dust2,map_de_inferno,map_de_mirage,map_de_nuke,map_de_overpass,map_de_train,map_de_vertigo
0,175.00000,0.0,0.0,False,500.0,500.0,0.0,0.0,4000.0,4000.0,...,4512.0,4512.0,1,True,False,False,False,False,False,False
1,156.00000,0.0,0.0,False,500.0,500.0,400.0,300.0,600.0,650.0,...,1462.0,1511.0,1,True,False,False,False,False,False,False
2,96.00000,0.0,0.0,False,391.0,400.0,294.0,200.0,750.0,500.0,...,1110.0,1444.0,1,True,False,False,False,False,False,False
3,76.00000,0.0,0.0,False,391.0,400.0,294.0,200.0,750.0,500.0,...,1108.0,1444.0,1,True,False,False,False,False,False,False
4,175.00000,1.0,0.0,False,500.0,500.0,192.0,0.0,18352.0,10752.0,...,11264.0,19056.0,1,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117443,15.40625,11.0,14.0,True,200.0,242.0,195.0,359.0,100.0,5952.0,...,6584.0,517.0,0,False,False,False,False,False,True,False
117444,174.87500,11.0,15.0,False,500.0,500.0,95.0,175.0,11504.0,23904.0,...,24608.0,12120.0,0,False,False,False,False,False,True,False
117445,114.93750,11.0,15.0,False,500.0,500.0,495.0,475.0,1200.0,6700.0,...,7724.0,2232.0,0,False,False,False,False,False,True,False
117446,94.93750,11.0,15.0,False,500.0,500.0,495.0,475.0,1200.0,6700.0,...,7720.0,2224.0,0,False,False,False,False,False,True,False


In [129]:
y = label_encode_column(df, column_name='round_winner')
y

2025-01-21 11:26:11,982 - INFO - Successfully label encoded column 'round_winner'.


0         0
1         0
2         0
3         0
4         0
         ..
117443    1
117444    1
117445    1
117446    1
117447    1
Name: round_winner, Length: 117448, dtype: int64

In [None]:
# df.drop('round_winner', axis=1, inplace=True)

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117448 entries, 0 to 117447
Columns: 102 entries, time_left to round_winner_encoded
dtypes: bool(1), category(1), float16(98), int64(1), int8(1)
memory usage: 23.2 MB
