# Data Wrangling
In this notebook, I further wrangle and format the data in order to prepare it for modeling.

In the following cells, I:
*   Apply text cleaning to the summaries (lowercasing, removing punctuation, etc.)
*   Encode the target labels (components) as integers for classification

The final DataFrame keeps all of the original columns and adds:
*    A cleaned summary column (`cleaned summary`) for vectorization
*    A numeric label column (`label`)



In [1]:
import pandas as pd
import re
import string
from sklearn.preprocessing import LabelEncoder
from google.colab import drive

# Colab only: attach Google Drive so files under /content/drive become readable
drive.mount('/content/drive')

# Load the cleaned complaints dataset from the project folder on Drive
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/vehicle-complaints-project/complaints_cleaned_top10.csv'
)
print("Loaded dataset shape:", df.shape)

# Preview the first rows as the cell output
df.head()

Mounted at /content/drive
Loaded dataset shape: (149207, 15)


Unnamed: 0,odiNumber,manufacturer,crash,fire,numberOfInjuries,numberOfDeaths,dateOfIncident,dateComplaintFiled,vin,components,summary,products,make,model,modelYear
0,10713088,"Kia America, Inc.",False,False,0,0,04/24/2015,04/27/2015,5XYPH4A15GG,structure,"traveling at highway speed, the windshield see...","[{""type"": ""Vehicle"", ""productYear"": ""2016"", ""p...",KIA,SORENTO,2016
1,10713503,"ALUMA TOWER COMPANY, INC",False,True,0,0,04/28/2015,04/29/2015,1YGAE1629GB,electrical system,tl* the contact owns a 2016 aluma ae716ta trai...,"[{""type"": ""Vehicle"", ""productYear"": ""2016"", ""p...",ALUMA,AE716TA,2016
2,10715078,"Kia America, Inc.",False,False,0,0,04/30/2015,05/07/2015,,steering,"while traveling at highway speeds, the sorento...","[{""type"": ""Vehicle"", ""productYear"": ""2016"", ""p...",KIA,SORENTO,2016
3,10721462,"Kia America, Inc.",False,False,0,0,05/24/2015,05/26/2015,5XYPKDA1XGG,vehicle speed control,tl* the contact owns a 2016 kia sorento. while...,"[{""type"": ""Vehicle"", ""productYear"": ""2016"", ""p...",KIA,SORENTO,2016
4,10725192,Mazda Motor Corp.,False,False,0,0,04/15/2015,06/15/2015,,visibility/wiper,"glass cracked from top, middle portion of wind...","[{""type"": ""Vehicle"", ""productYear"": ""2016"", ""p...",MAZDA,CX-5,2016


In [2]:
# Creating a function to clean the summary texts
def clean_summary(summary):
  """Normalize one complaint summary for vectorization.

  Steps, in order: lowercase; turn line breaks into spaces; strip URLs;
  delete every character that is not a-z or whitespace (punctuation,
  digits, non-ASCII letters); collapse whitespace runs and trim the ends.
  Deleted characters are removed without inserting a space, so a token
  such as "ae716ta" becomes "aeta".

  Args:
    summary: Raw summary text. Non-string values (e.g. the float NaN that
      pandas uses for a missing cell) are treated as missing.

  Returns:
    The cleaned string (possibly empty), or None when `summary` is not a
    string, so that a later `dropna` can remove the row.
  """
  # A missing cell is a float NaN, which has no .lower(); return a null
  # instead of raising so .apply() can run over the whole column.
  if not isinstance(summary, str):
    return None

  summary = summary.lower()
  summary = re.sub(r'\n', ' ', summary)                     # remove line breaks
  summary = re.sub(r'https?://\S+|www\.\S+', '', summary)   # remove URLs
  summary = re.sub(r'[^a-z\s]', '', summary)                # remove punctuation/numbers
  summary = re.sub(r'\s+', ' ', summary).strip()            # remove extra spaces
  return summary

# Drop rows with a missing summary BEFORE cleaning: a missing cell is a float
# NaN rather than text, so there is nothing to clean in those rows.
df = df.dropna(subset=["summary"]).reset_index(drop=True)

df["cleaned summary"] = df["summary"].apply(clean_summary)

# Also drop rows whose cleaned text is null or empty. A summary made up only
# of digits/punctuation cleans to "", which is not NaN here but is parsed back
# as NaN by pd.read_csv once the frame has been saved to CSV.
is_blank = df["cleaned summary"].isna() | (df["cleaned summary"] == "")
df = df[~is_blank].reset_index(drop=True)

df[["summary", "cleaned summary"]].head()

Unnamed: 0,summary,cleaned summary
0,"traveling at highway speed, the windshield see...",traveling at highway speed the windshield seem...
1,tl* the contact owns a 2016 aluma ae716ta trai...,tl the contact owns a aluma aeta trailer na wh...
2,"while traveling at highway speeds, the sorento...",while traveling at highway speeds the sorento ...
3,tl* the contact owns a 2016 kia sorento. while...,tl the contact owns a kia sorento while drivin...
4,"glass cracked from top, middle portion of wind...",glass cracked from top middle portion of winds...


In [3]:
# ML models need numeric targets, so map each component name to an integer id.
# LabelEncoder sorts the distinct values and numbers them 0..n_classes-1.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['components'])

# Keep a name -> id lookup for later reference. classes_[i] is encoded as i, so
# enumerate gives the same mapping as transform(classes_) but with plain Python
# ints, which keeps the printout free of np.int64(...) wrappers.
label_map = {name: idx for idx, name in enumerate(label_encoder.classes_)}
print("Label Mapping:")
print(label_map)

df[["components", "label"]].head(10)

Label Mapping:
{'air bags': np.int64(0), 'electrical system': np.int64(1), 'engine': np.int64(2), 'fuel/propulsion system': np.int64(3), 'power train': np.int64(4), 'service brakes': np.int64(5), 'steering': np.int64(6), 'structure': np.int64(7), 'vehicle speed control': np.int64(8), 'visibility/wiper': np.int64(9)}


Unnamed: 0,components,label
0,structure,7
1,electrical system,1
2,steering,6
3,vehicle speed control,8
4,visibility/wiper,9
5,service brakes,5
6,visibility/wiper,9
7,power train,4
8,electrical system,1
9,power train,4


In [4]:
# Remove repeated complaints: rows sharing both the cleaned text and the
# component are kept once (first occurrence), then the index is renumbered.
df = (
    df
    .drop_duplicates(subset=['cleaned summary', 'components'])
    .reset_index(drop=True)
)

# Report the size of the de-duplicated frame
print("Final data shape: ", df.shape)

# Write the wrangled frame to Drive (without the index) for the modelling step
df.to_csv(
    '/content/drive/MyDrive/vehicle-complaints-project/complaints_wrangled.csv',
    index=False,
)
print("Wrangled dataset saved.")

Final data shape:  (147422, 17)
Wrangled dataset saved.
