## Main steps:

1. Look at the big picture.
2. Get the data.
3. Explore and visualize the data to gain insights.
4. Prepare the data for machine learning algorithms.
5. Select a model and train it.
6. Output as CSV.

In [8]:
#import libraries
import pandas as pd
import numpy as numpy

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [9]:
#Loading the data

def load_data(data_dir=None):
    """
    Load the DrivenData competition files into pandas dataframes.

    Parameters:
        data_dir: Optional directory (str or path-like) holding local copies
            of the four files, named ``train_values.csv``,
            ``train_labels.csv``, ``test_values.csv`` and
            ``submission_format.csv``. When omitted, the files are read from
            the pre-signed URLs embedded in this function.

    Returns:
        Pandas dataframes of the training values, training labels, test values,
        and submission format
    """
    # Function-scope import so this cell does not depend on another cell's imports
    from pathlib import Path

    # NOTE: these are pre-signed S3 links. Their query strings carry
    # X-Amz-Date=20240201T103937Z and X-Amz-Expires=86400, so they are only
    # usable inside that window; pass data_dir to read downloaded copies.

    # The submission format
    # this is what our .csv output file should look like
    # make sure damage_grade is integer, not float!
    url_submission_format = ('https://drivendata-prod.s3.amazonaws.com/data/57/public/submission_format.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYQTZTLQOS%2F20240201%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240201T103937Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=2546ac2508675ca4e0161409520fab1e3552e9342e03468074572960192fc12c')

    # The test values
    url_test_values = ('https://drivendata-prod.s3.amazonaws.com/data/57/public/test_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYQTZTLQOS%2F20240201%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240201T103937Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=5782f88e43df91a5b05951c6c868b1b9699b281042a38066e35aacba349e329c')

    # The training labels
    url_train_labels = ('https://drivendata-prod.s3.amazonaws.com/data/57/public/train_labels.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYQTZTLQOS%2F20240201%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240201T103937Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=36d24a38b88f5c6a4acddcbbc1db9b6518b71e50ec03a74fbe84fc30a62a9acc')

    # The training features
    url_train_values = ('https://drivendata-prod.s3.amazonaws.com/data/57/public/train_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCYQTZTLQOS%2F20240201%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240201T103937Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=be3146653c1a73b442dce121ce46ccef21a67a28347eefc2c62c473eff0a00e8')

    # Map each expected file name to its remote location
    remote_sources = {
        'submission_format.csv': url_submission_format,
        'test_values.csv': url_test_values,
        'train_labels.csv': url_train_labels,
        'train_values.csv': url_train_values,
    }

    def _read(filename):
        # Prefer the local copy when a directory was supplied, else the URL
        if data_dir is None:
            source = remote_sources[filename]
        else:
            source = Path(data_dir) / filename
        return pd.read_csv(source)

    submission_format = _read('submission_format.csv')
    test_values = _read('test_values.csv')
    train_labels = _read('train_labels.csv')
    train_values = _read('train_values.csv')

    return train_values, train_labels, test_values, submission_format

# Call load_data() with no arguments and unpack its four returned dataframes
# into module-level names used by the cells below.
train_values, train_labels, test_values, submission_format = load_data()


In [10]:
#Preprocess the data
def preprocess_data(train_values, train_labels, scaler=None):
    """
    Standardise the numeric training features and cast the labels to int.

    Parameters:
        train_values: DataFrame of training features. Only columns with a
            numeric dtype are kept; all other columns are dropped.
        train_labels: Label values supporting ``.astype(int)``.
        scaler: Optional scaler object exposing ``fit_transform``. It is
            fitted in place, so a caller that passes its own instance keeps
            the fitted statistics and can apply the same transform to other
            data later. Defaults to a fresh ``StandardScaler``.

    Returns:
        Tuple ``(train_values_scaled, train_labels)``: the output of the
        scaler's ``fit_transform`` and the labels cast to integers.
    """
    # Select only numerical columns
    numerical_train_values = train_values.select_dtypes(include=['number'])

    # Scale numerical features; without a caller-supplied scaler the fitted
    # statistics are not retrievable after this function returns
    if scaler is None:
        scaler = StandardScaler()
    train_values_scaled = scaler.fit_transform(numerical_train_values)

    # Ensure labels are integers
    train_labels = train_labels.astype(int)

    return train_values_scaled, train_labels

# Preprocess data
# Only the 'damage_grade' column of train_labels is passed as the target;
# the inline note below flags that this column name is an assumption.
X_train, y_train = preprocess_data(train_values, train_labels['damage_grade'])  # Assuming 'damage_grade' is the label column

def preprocess_test_data(test_values, scaler=None):
    """
    Keep the numeric columns of the test features and scale them.

    Parameters:
        test_values: DataFrame of test features. Only columns with a numeric
            dtype are kept; all other columns are dropped.
        scaler: Optional, already-fitted scaler exposing ``transform``. When
            given, it is applied as-is (no refitting) so the test features are
            scaled with the same statistics the scaler was fitted on. When
            omitted, a new ``StandardScaler`` is fitted on the test data
            itself, which is the original behaviour.

    Returns:
        The scaled numeric test features as returned by the scaler.
    """
    # Select only numerical columns
    numerical_test_values = test_values.select_dtypes(include=['number'])

    if scaler is not None:
        # Reuse previously fitted statistics; never refit on test data here
        return scaler.transform(numerical_test_values)

    # Fallback: fit on the test set itself. This scales the test features
    # with their own mean/variance rather than the training set's.
    return StandardScaler().fit_transform(numerical_test_values)


In [None]:
#Explore and visualise

#Prepare the data

In [None]:
#Select a model and train it

def train_ordinal_logistic(X_train, y_train):
    """
    Fit a LogisticAT (Logistic All Thresholds) ordinal regression model.

    The features are standardised inside this function before fitting; the
    scaler used for that is local and is not returned.

    Parameters:
    - X_train: Training feature set.
    - y_train: Training target variable.

    Returns:
    - The fitted LogisticAT model.
    """
    # NOTE(review): `m` is not bound anywhere in the visible source --
    # presumably `import mord as m` is expected; confirm before running.

    # Zero-mean / unit-variance standardisation of the features
    standardized_features = StandardScaler().fit_transform(X_train)

    # Build the estimator, then fit it on the standardised features
    ordinal_model = m.LogisticAT()
    ordinal_model.fit(standardized_features, y_train)
    return ordinal_model

# Train the model
model = train_ordinal_logistic(X_train, y_train)
# Bare expression: as the last line of a notebook cell this displays the model
model

In [None]:
# predict


def predict(model, test_values, submission_format):
    """
    Run a trained model over the test set and return a submission-shaped frame.

    Parameters:
    - model: Trained model exposing ``predict``.
    - test_values: Raw test features; they are preprocessed inside this function.
    - submission_format: DataFrame giving the layout required for submission.

    Returns:
    - A copy of ``submission_format`` whose 'damage_grade' column holds the
      model's predictions cast to integers.
    """
    # Run the shared test-set preprocessing before handing features to the model
    prepared_features = preprocess_test_data(test_values)
    raw_grades = model.predict(prepared_features)

    # assign() operates on a copy, so the caller's submission_format is left
    # untouched; the cast guarantees integer damage grades in the output
    return submission_format.assign(damage_grade=raw_grades.astype(int))

# Example usage, assuming test_values and submission_format are defined
# (both are produced by the load_data() call earlier in this file)
formatted_predictions = predict(model, test_values, submission_format)

# To save the predictions to a CSV file compatible with the competition's submission format
# index=False keeps the dataframe's row index out of the written file
formatted_predictions.to_csv('submission.csv', index=False)


In [2]:
#Output as CSV