In [16]:
# Mount Google Drive at /content/drive; the CSV paths used by later cells
# all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [18]:
# Normalise a free-text value to lowercase alphanumeric words
def text_cleaning(text):
    """Return a normalised copy of ``text``, or NaN when it is not a string.

    The value is trimmed and lower-cased, hyphens are dropped (so hyphenated
    words are fused rather than split), every remaining character that is not
    a lowercase ASCII letter, a digit or whitespace is removed, and each run
    of whitespace is collapsed to a single space.

    Parameters
    ----------
    text : object
        Value to clean. Anything that is not a ``str`` is treated as missing.

    Returns
    -------
    str or float
        The cleaned string, or ``np.nan`` for non-string input.
    """
    # Guard clause: missing / numeric cells become NaN
    if not isinstance(text, str):
        return np.nan

    cleaned = text.strip().lower()

    # Substitutions are applied strictly in this order
    substitutions = (
        (r'[-]', ''),            # drop hyphens
        (r'[^a-z0-9\s]', ''),    # drop any other special character
        (r'\s+', ' '),           # squeeze whitespace runs into one space
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    # Removing characters can expose new leading/trailing spaces
    return cleaned.strip()

In [19]:
def capitalize_text(text):
    """Capitalize every space-separated word in ``text``.

    Each word has its first character upper-cased and the rest lower-cased
    (the behaviour of ``str.capitalize``). The text is split on single
    spaces, so consecutive spaces are preserved as-is.

    Parameters
    ----------
    text : object
        The string to transform. Non-string values (e.g. the NaN that
        ``text_cleaning`` returns for missing cells) are accepted.

    Returns
    -------
    object
        The capitalized string, or ``text`` itself, unchanged, when it is
        not a string.
    """
    # Missing values arrive here as NaN (a float) and have no .split();
    # hand them back untouched instead of raising AttributeError.
    if not isinstance(text, str):
        return text

    # Split on a literal space, capitalize each piece, and stitch back together
    return ' '.join(word.capitalize() for word in text.split(' '))

In [20]:
# Read data.csv from the mounted Drive folder into `df`
df = pd.read_csv(filepath_or_buffer=r'/content/drive/MyDrive/united2/RAw/data.csv')

In [21]:
# Keep only the relevant columns, working on an independent copy of `df`
df2 = df[[
    'call_id',
    'average_sentiment',
    'call_hour',
    'AHT',
    'AST',
    'primary_call_reason',
]].copy()

# Pre-process the text column: normalise it first, then capitalize each word
df2['primary_call_reason'] = (
    df2['primary_call_reason']
    .apply(text_cleaning)
    .apply(capitalize_text)
)



In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Integer-encode the target labels (primary call reason) for the classifier
label_encoder = LabelEncoder()
df2['primary_call_reason_encoded'] = label_encoder.fit_transform(
    df2['primary_call_reason']
)

# Feature matrix (X) and target vector (y)
# NOTE(review): 'call_id' looks like a row identifier rather than a predictive
# signal — confirm it is meant to be a feature.
X = df2[['call_id', 'average_sentiment', 'call_hour', 'AHT', 'AST']]
y = df2['primary_call_reason_encoded']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build an XGBoost classifier with default parameters and train it in one step
# (fit() returns the fitted estimator itself)
xgb_model = XGBClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out rows
y_pred = xgb_model.predict(X_test)

# Report the share of held-out rows that were classified correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')

# Map the predicted class ids back to their original text labels
predicted_labels = label_encoder.inverse_transform(y_pred)


Model Accuracy: 35.58%


In [30]:
# Load the test data
test_df = pd.read_csv(r'/content/drive/MyDrive/united2/RAw/test.csv')

# Extract the call_id column from the test data
test_X = test_df[['call_id']]

# Look up the remaining features for every test call_id in df2.
# validate='many_to_one' makes pandas raise a MergeError if df2 contains
# duplicate call_ids; without it the join would silently add rows and the
# column assignment further down would fail with a length mismatch.
test_data = pd.merge(test_X, df2, on='call_id', how='left', validate='many_to_one')

# Select the features for prediction: the same columns, in the same order,
# that xgb_model was fitted on ('call_id' is included)
test_X = test_data[['call_id','average_sentiment', 'call_hour', 'AHT', 'AST']]

# Use a tuned `best_model` when an earlier cell has created one; otherwise
# fall back to the XGBoost model trained above, so the cell also works after
# Restart & Run All instead of raising NameError.
model_for_prediction = globals().get('best_model')
if model_for_prediction is None:
    model_for_prediction = xgb_model

# Make predictions with the selected model
y_pred_test = model_for_prediction.predict(test_X)

# Convert predicted labels back to their original text form
predicted_labels_test = label_encoder.inverse_transform(y_pred_test)

# Add the predicted labels to the test dataframe (rows line up positionally
# because the left join keeps the order of test_df)
test_df['predicted_primary_call_reason'] = predicted_labels_test

# Save the results to a new CSV file
test_df.to_csv('/content/drive/MyDrive/united2/RAw/test_with_predictions.csv', index=False)

print("Predictions saved to test_with_predictions.csv")


Predictions saved to test_with_predictions.csv


## Signing Off
Team OR Focks