In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import contractions

## Get the reviews from all the existing car brands

We should keep only the files (from edmunds.com) corresponding to the unique car brands from the autovit dataset.

In [6]:
# get unique brands from the autovit dataset
df = pd.read_csv("../data/autovit_data.csv")
unique_brands = [
    brand.lower().replace("-", "").replace(" ", "")
    for brand in df["make"].unique()
]

print(unique_brands)
print(len(unique_brands))

['ford', 'audi', 'toyota', 'skoda', 'bmw', 'chevrolet', 'dacia', 'mitsubishi', 'jaguar', 'mercedesbenz', 'kia', 'volkswagen', 'seat', 'honda', 'citroën', 'hyundai', 'suzuki', 'opel', 'renault', 'volvo', 'mini', 'porsche', 'peugeot', 'nissan', 'mazda', 'landrover', 'fiat', 'alfaromeo', 'jeep', 'lamborghini', 'bentley', 'lexus']
32


In [None]:
files_path = "../data/archive/"
brands_dict = {}
pattern = r'_(.*?)\.csv'
regex = re.compile(pattern)

for csv_file in os.listdir(files_path):
    brand = csv_file.rsplit('_', 1)[-1].split('.')[0].lower()
    brand = brand.replace("-", "").replace(" ", "")
    brands_dict[brand] = csv_file 

print(len(brands_dict))
print(brands_dict)

In total, there are 50 car brands, and in the end, there should be 32, once correlated with the brands from the Autovit dataset.

In [43]:
# get file names
filenames = [
    filename
    for brand, filename in brands_dict.items()
    if brand in unique_brands
]

len(filenames)

25

It seems that in the end there are 25 car brands. So, 7 brands are missing, including Skoda, Seat, etc.

In [45]:
df_concat = pd.DataFrame()
dataframes_from_csv = []

for csv_filename in filenames:
    df_temp = pd.read_csv(files_path + csv_filename, lineterminator="\n")
    dataframes_from_csv.append(df_temp)

df_concat = pd.concat(dataframes_from_csv, ignore_index=True)
df_concat.shape

(167313, 7)

#### Save reviews in their original form before preprocessing

In [46]:
df_concat.to_csv("../data/edmunds_reviews.csv")

### Preprocessing
* remove punctuation
* remove stopwords
* lemmatize words

In [5]:
df = pd.read_csv("../data/edmunds_reviews.csv", lineterminator="\n")

In [7]:
df.drop("Unnamed: 0", axis=1, inplace=True)
# df.drop("Unnamed: 0.1", axis=1, inplace=True)
df.head(2)

Unnamed: 0,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating\r
0,on 09/18/11 00:19 AM (PDT),wizbang_fl,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"New Beetle- Holds up well & Fun to Drive, but ...",I've had my Beetle Convertible for over 4.5 y...,4.5
1,on 07/07/10 05:28 AM (PDT),carlo frazzano,2007 Volkswagen New Beetle Convertible 2.5 PZE...,Quality Review,We bought the car new in 2007 and are general...,4.375


In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import Counter


lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = stopwords.words("english")
stopwords_dict = Counter(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alex9popa/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/alex9popa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def preprocess_text(text):
    text = ' '.join(
        [
            lemmatizer.lemmatize(word.lower())
            for word in text.split()
            if word not in stopwords_dict
        ]
    )
    return text

In [33]:
example = df["Review"][0]
print(example + "\n")
print(preprocess_text(example))

 I've had my Beetle Convertible for over 4.5 years andhave been overall happy with the car. It is a compact convertible.Don't expect a big trunk or having tall people in the back seat.

i've beetle convertible 4.5 year andhave overall happy car. it compact convertible.don't expect big trunk tall people back seat.


In [10]:
df["Review"] = df["Review"].apply(preprocess_text)

In [11]:
df.head(2)

Unnamed: 0,Review_Date,Author_Name,Vehicle_Title,Review_Title,Review,Rating\r
0,on 09/18/11 00:19 AM (PDT),wizbang_fl,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"New Beetle- Holds up well & Fun to Drive, but ...",i've beetle convertible 4.5 year andhave overa...,4.5
1,on 07/07/10 05:28 AM (PDT),carlo frazzano,2007 Volkswagen New Beetle Convertible 2.5 PZE...,Quality Review,we bought car new 2007 generally satisfied. me...,4.375


#### Save the preprocessed (lemmatize + stopwords) dataframe to csv. 

In [12]:
df.to_csv("../data/edmunds_preprocessed_reviews.csv")

### Remove punctuation after applying contractions:

In [2]:
def remove_punction_and_contractions(text):
    return re.sub(r'[^\w\s]', '', contractions.fix(str(text)))

In [3]:
# remove punctuations and word shortcuts
df = pd.read_csv("../data/edmunds_preprocessed_reviews.csv")
df["Review"] = df["Review"].apply(remove_punction_and_contractions)

# drop unnecessary columns
df.drop("Review_Date", axis=1, inplace=True)
df.drop("Unnamed: 0", axis=1, inplace=True)
df.drop("Author_Name", axis=1, inplace=True)

# round car ratings to the nearest .5 value
df["Rating"] = df["Rating"].apply(lambda x: round(x * 2) / 2)
df.head(2)

Unnamed: 0,Vehicle_Title,Review_Title,Review,Rating
0,2007 Volkswagen New Beetle Convertible 2.5 2dr...,"New Beetle- Holds up well & Fun to Drive, but ...",i have beetle convertible 45 year andhave over...,4.5
1,2007 Volkswagen New Beetle Convertible 2.5 PZE...,Quality Review,we bought car new 2007 generally satisfied mec...,4.5


In [6]:
df.isna().sum()

Vehicle_Title     0
Review_Title     19
Review            0
Rating            0
dtype: int64

In [7]:
# remove remaining NaN rows
df.dropna(subset=["Review_Title"], inplace=True)

In [None]:
df.to_csv("../data/final_edmunds_reviews.csv", index=False)