In [9]:
import pandas as pd
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [10]:
# Load the Excel file
file_path = r"Masters Data Set FINAL.xlsx"  # Update with your file path
df = pd.read_excel(file_path)

In [11]:
# Define a function to analyze sentiment using TextBlob
def analyze_sentiment(description):
    # Check if description is a string and not empty
    if isinstance(description, str) and description.strip() != "":
        blob = TextBlob(description)
        sentiment_score = blob.sentiment.polarity
        # Sentiment classification: Positive, Negative, Neutral
        if sentiment_score > 0:
            sentiment_label = 'Positive'
        elif sentiment_score < 0:
            sentiment_label = 'Negative'
        else:
            sentiment_label = 'Neutral'
        return sentiment_score, sentiment_label
    return 0, 'Neutral'  # Default to Neutral if description is missing or invalid

In [12]:
# Apply the sentiment analysis function to the 'Accident Description' column
df['Sentiment_Score'], df['Sentiment_Label'] = zip(*df['Accident Description'].apply(analyze_sentiment))

In [13]:
# Convert 'Litigation Flag' column (Y = litigated, N = not litigated) into binary values
df['Litigated'] = df['Litigation Flag'].map({'Y': 1, 'N': 0})

In [15]:
# Drop rows where essential information is missing
df = df.dropna(subset=['Litigated', 'Accident Description'])

# Feature Engineering: Use Sentiment Score, Sentiment Label, and other relevant columns
X = df[['Sentiment_Score']]  # You can add more features from the data here
y = df['Litigated']  # The target variable (litigation flag converted to binary)

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a classification model (e.g., Random Forest)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

# Optional: Save the trained model if needed
joblib.dump(model, 'litigation_prediction_model.pkl')

# Make predictions on the entire dataset (or new data) and add to a new column
df['Predicted_Litigation_Flag'] = model.predict(X)

# Save the updated DataFrame with the predicted litigation flag back to Excel
output_file_path = "accident_data_with_predictions.xlsx"  # Specify your output file path
df.to_excel(output_file_path, index=False)

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      6882
           1       0.00      0.00      0.00       374

    accuracy                           0.95      7256
   macro avg       0.47      0.50      0.49      7256
weighted avg       0.90      0.95      0.92      7256

