In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier  # Import XGBClassifier from xgboost module
from sklearn.metrics import accuracy_score














In [None]:
# Read the Data
# train_data must be loaded from the *_train.csv file and test_data from the
# *_test.csv file; every later cell fits on train_data and evaluates on
# test_data, so mixing the two files up silently inverts the whole workflow.
train_data = pd.read_csv('C:/Users/KIIT/Datasets/Consumer_Complaints_train.csv')
test_data = pd.read_csv('C:/Users/KIIT/Datasets/Consumer_Complaints_test.csv')

# Only the last expression of a cell is rendered, so report both shapes
# explicitly and preview a handful of training rows instead of dumping frames.
print(train_data.shape, test_data.shape)
train_data.head()

In [None]:

# Report the column dtypes of each dataset, one labelled section per frame.
for heading, frame in (("Train Data Types:", train_data),
                       ("\nTest Data Types:", test_data)):
    print(heading)
    print(frame.dtypes)


In [None]:

# Missing value analysis and dropping columns with more than 25% missing data.
# `thresh` is the minimum number of NON-null values a column needs in order to
# be kept, so "at most 25% missing" translates to "at least 75% present".
# ceil() keeps the rule exact when 0.75 * n is not a whole number, and int()
# gives dropna the integer count it expects.
threshold = int(np.ceil(len(train_data) * 0.75))

columns_before = train_data.columns
train_data.dropna(thresh=threshold, axis=1, inplace=True)
sparse_columns = columns_before.difference(train_data.columns)

# Remove exactly the same columns from the test set so both frames keep an
# identical schema; errors='ignore' covers columns the test file never had.
test_data.drop(columns=sparse_columns, errors='ignore', inplace=True)


In [None]:
# Parse 'Date received' as a datetime and derive the calendar components
# (month, year, day-of-month) as separate columns on both datasets.
for frame in (train_data, test_data):
    frame['Date received'] = pd.to_datetime(frame['Date received'])
    received_parts = frame['Date received'].dt
    frame['Month'] = received_parts.month
    frame['Year'] = received_parts.year
    frame['Day'] = received_parts.day



In [None]:

# Ensure 'Date received' is stored as a datetime in both datasets.
# (No new columns are created here; this only (re)applies the conversion.)
for frame in (train_data, test_data):
    frame['Date received'] = pd.to_datetime(frame['Date received'])


In [None]:

# Number of whole days between receiving a complaint and sending it on to the
# company, stored as 'Days Held' in both datasets.
for frame in (train_data, test_data):
    sent_on = pd.to_datetime(frame['Date sent to company'])
    elapsed = sent_on - frame['Date received']
    frame['Days Held'] = elapsed.dt.days


In [None]:

# Remove the raw date columns and the identifier-like columns from both
# datasets, in place.
unneeded_columns = ['Date received', 'Date sent to company', 'ZIP code', 'Complaint ID']
for frame in (train_data, test_data):
    frame.drop(columns=unneeded_columns, inplace=True)


In [None]:



# Impute null values in "State" with the mode.
# The mode is learned from the training data only and reused for the test
# data, so no statistic computed on the test set leaks into preprocessing.
state_mode = train_data['State'].mode()[0]

# Assign the filled Series back to the column: `df[col].fillna(..., inplace=True)`
# operates on a temporary object and does not reliably update the frame
# (it is a no-op under pandas Copy-on-Write).
train_data['State'] = train_data['State'].fillna(state_mode)
test_data['State'] = test_data['State'].fillna(state_mode)

In [None]:

# Create 'Week_Received' column (ISO week number of the date the complaint
# was received).
# 'Date received' has already been dropped by the time this cell runs, so the
# date is rebuilt from the 'Year' / 'Month' / 'Day' columns derived from it.
# `Series.dt.week` no longer exists in pandas 2.x; `dt.isocalendar().week` is
# its replacement.
for frame in (train_data, test_data):
    received = pd.to_datetime(
        {'year': frame['Year'], 'month': frame['Month'], 'day': frame['Day']}
    )
    # isocalendar() yields a nullable UInt32 column; cast to a plain integer
    # dtype so the feature behaves like the other numeric columns.
    frame['Week_Received'] = received.dt.isocalendar().week.astype('int64')

In [None]:


# Keep only the training rows where the consumer disputed the response.
is_disputed = train_data['Consumer disputed?'].eq('Yes')
disputed_cons = train_data.loc[is_disputed]

In [None]:

# Bar chart: how many complaints were disputed vs. not disputed.
ax = sns.countplot(data=train_data, x='Consumer disputed?')
ax.set_title('Total Number of Disputes')
plt.show()

In [None]:
# Bar chart: complaint counts per product, split by dispute outcome.
ax = sns.countplot(data=train_data, x='Product', hue='Consumer disputed?')
ax.set_title('Disputes by Product')
# Product names are long, so turn the tick labels vertical.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()


In [None]:
# Bar chart: dispute outcome for the five most frequently reported issues.
issue_counts = train_data['Issue'].value_counts()
top_issues = issue_counts.nlargest(5).index
top_issue_rows = train_data[train_data['Issue'].isin(top_issues)]

ax = sns.countplot(data=top_issue_rows, x='Issue', hue='Consumer disputed?')
ax.set_title('Disputes by Top Issues')
# Issue descriptions are long, so turn the tick labels vertical.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Plot bar graph of total no of disputes by State with Maximum Disputes.
# Only rows whose outcome is 'Yes' are counted: a plain groupby(...).count()
# on the target column counts every complaint with a recorded outcome, which
# ranks states by complaint volume rather than by number of disputes.
is_disputed = train_data['Consumer disputed?'] == 'Yes'
# value_counts() already sorts in descending order, so head(10) is the top 10.
state_disputes = train_data.loc[is_disputed, 'State'].value_counts().head(10)

ax = state_disputes.plot(kind='bar')
ax.set_title('Disputes by State')
ax.set_xlabel('State')
ax.set_ylabel('Number of Disputes')
plt.show()

In [None]:

# Bar chart: complaint counts per submission channel, split by dispute outcome.
ax = sns.countplot(data=train_data, x='Submitted via', hue='Consumer disputed?')
ax.set_title('Disputes by Submission Source')
plt.show()

In [None]:

# Bar chart: complaint counts per type of company response, split by dispute
# outcome.
ax = sns.countplot(
    data=train_data,
    x='Company response to consumer',
    hue='Consumer disputed?',
)
ax.set_title('Disputes by Company Response')
# Response descriptions are long, so turn the tick labels vertical.
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()


In [None]:

# Bar chart: dispute outcome for timely vs. non-timely company responses.
ax = sns.countplot(data=train_data, x='Timely response?', hue='Consumer disputed?')
ax.set_title('Disputes by Timely Response')
plt.show()

In [None]:


# Bar chart: all complaints per year received, split by dispute outcome.
ax = sns.countplot(data=train_data, x='Year', hue='Consumer disputed?')
ax.set_title('Disputes over Year Wise Complaints')
plt.show()

In [None]:

# Bar chart: disputed complaints only (disputed_cons), counted per year received.
ax = sns.countplot(data=disputed_cons, x='Year', hue='Consumer disputed?')
ax.set_title('Disputes over Year Wise Disputes')
plt.show()


In [None]:



# Drop unnecessary columns for model building.
# The year and holding-time features are stored under the names 'Year' and
# 'Days Held' in this notebook, so those are the labels that must be dropped;
# asking drop() for a label that does not exist raises a KeyError.
model_drop_columns = ['Company', 'State', 'Year', 'Days Held']
train_data.drop(columns=model_drop_columns, inplace=True)
test_data.drop(columns=model_drop_columns, inplace=True)

In [None]:


# Encode the 'Consumer disputed?' labels as integers. The encoder learns the
# label set from the training target and the same mapping is applied to the
# test target.
target_column = 'Consumer disputed?'
label_encoder = LabelEncoder()
label_encoder.fit(train_data[target_column])
train_data[target_column] = label_encoder.transform(train_data[target_column])
test_data[target_column] = label_encoder.transform(test_data[target_column])


In [None]:


# Create Dummy Variables for categorical features
cat_cols = ['Product', 'Submitted via', 'Company response to consumer', 'Timely response?']

# Pin each column's category set to the values seen in training before
# encoding. Encoding the two frames independently lets each one derive its own
# category list, so the resulting dummy columns (and the baseline removed by
# drop_first) can differ between train and test and break the later
# scaler / PCA / model steps.
for col in cat_cols:
    train_categories = pd.CategoricalDtype(sorted(train_data[col].dropna().unique()))
    train_data[col] = train_data[col].astype(train_categories)
    # Values that only occur in the test set become NaN here and therefore
    # encode as all-zero dummies (i.e. the baseline category).
    test_data[col] = test_data[col].astype(train_categories)

train_data = pd.get_dummies(train_data, columns=cat_cols, drop_first=True)
test_data = pd.get_dummies(test_data, columns=cat_cols, drop_first=True)


In [None]:



# Scaling the Data Sets
scaler = StandardScaler()

# StandardScaler only accepts numeric input. Text columns that were never
# encoded (for example 'Issue') are still present in the frame at this point,
# so the feature matrix is restricted to numeric and boolean (dummy) columns.
# NOTE(review): if any dropped text column should be a model feature, encode
# it in an earlier cell instead of relying on this filter.
X_train = (
    train_data
    .drop(columns=['Consumer disputed?'])
    .select_dtypes(include=['number', 'bool'])
)

# Give the test matrix exactly the training columns, in the same order; a
# feature missing from the test frame is filled with 0.
X_test = test_data.reindex(columns=X_train.columns, fill_value=0)

# Fit the scaling statistics on the training features only, then reuse them.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Reduce dimensionality with PCA, keeping as many components as are needed to
# explain 80% of the variance. The projection is fitted on the scaled training
# features and then applied unchanged to the scaled test features.
explained_variance_target = 0.8
pca = PCA(n_components=explained_variance_target)
X_train_pca = pca.fit_transform(X=X_train_scaled)
X_test_pca = pca.transform(X=X_test_scaled)

In [None]:

# Pull the dependent variable (the encoded 'Consumer disputed?' column) out of
# each dataset; the independent variables are the PCA matrices built earlier.
y_train, y_test = (
    frame['Consumer disputed?'] for frame in (train_data, test_data)
)


In [None]:
# Build the given models and measure their train and test accuracy.
# A fixed seed is passed to every estimator that accepts one so the reported
# accuracies (and the predictions saved later) are reproducible across runs.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "KNeighbors": KNeighborsClassifier(),  # deterministic; takes no seed
    "XGBoost": XGBClassifier(random_state=RANDOM_STATE)
}

for name, model in models.items():
    # Fit on the PCA-reduced training features, then score the fitted model on
    # both the data it was trained on and the held-out test data.
    model.fit(X_train_pca, y_train)
    train_pred = model.predict(X_train_pca)
    test_pred = model.predict(X_test_pca)
    train_acc = accuracy_score(y_train, train_pred)
    test_acc = accuracy_score(y_test, test_pred)
    print(f"{name}: Train Accuracy: {train_acc}, Test Accuracy: {test_acc}")

In [None]:



# Predict the outcome for the test file with the fitted Random Forest and
# overwrite the 'Consumer disputed?' column of test_data with the predictions.
best_model = models["Random Forest"]
predicted_outcome = best_model.predict(X_test_pca)
test_data['Consumer disputed?'] = predicted_outcome

# Persist the test frame, predictions included, without the index column.
test_data.to_csv("predicted_test_data.csv", index=False)