In [66]:
import pandas as pd
import re
import string

In [67]:
df = pd.read_csv("reddit_ev_posts.csv")

In [68]:
df.head(5)

Unnamed: 0,subreddit,title,selftext,created_utc,upvotes,num_comments,url
0,electricvehicles,Audi Went To China To Build Cars. China Rebuil...,,2025-06-02 14:34:41,5,0,https://insideevs.com/news/761013/china-audi-r...
1,electricvehicles,What can we actually do with EV batteries once...,"Hey everyone,\n\nI’m exploring the idea of reu...",2025-06-02 12:41:52,2,9,https://www.reddit.com/r/electricvehicles/comm...
2,electricvehicles,General Questions and Purchasing Advice Thread...,"**Need help choosing an EV, finding a home cha...",2025-06-02 14:00:58,1,0,https://www.reddit.com/r/electricvehicles/comm...
3,electricvehicles,EV Batteries Got 20% Cheaper Last Year,,2025-06-02 13:55:19,97,13,https://insideevs.com/news/761338/ev-battery-c...
4,electricvehicles,Thoughts on Automakers having control over you...,There has been a recent uproar regarding Zeekr...,2025-06-02 09:47:23,8,17,https://www.reddit.com/r/electricvehicles/comm...


## Data Cleaning

In [69]:
df.isnull()

Unnamed: 0,subreddit,title,selftext,created_utc,upvotes,num_comments,url
0,False,False,True,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
1195,False,False,False,False,False,False,False
1196,False,False,False,False,False,False,False
1197,False,False,False,False,False,False,False
1198,False,False,True,False,False,False,False


In [70]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
1195    False
1196    False
1197    False
1198    False
1199    False
Length: 1200, dtype: bool

In [71]:
# Combining Title + SelfText
df["full_text"] = df["title"].fillna('') + " " + df["selftext"].fillna('')

In [72]:
df.head(5)

Unnamed: 0,subreddit,title,selftext,created_utc,upvotes,num_comments,url,full_text
0,electricvehicles,Audi Went To China To Build Cars. China Rebuil...,,2025-06-02 14:34:41,5,0,https://insideevs.com/news/761013/china-audi-r...,Audi Went To China To Build Cars. China Rebuil...
1,electricvehicles,What can we actually do with EV batteries once...,"Hey everyone,\n\nI’m exploring the idea of reu...",2025-06-02 12:41:52,2,9,https://www.reddit.com/r/electricvehicles/comm...,What can we actually do with EV batteries once...
2,electricvehicles,General Questions and Purchasing Advice Thread...,"**Need help choosing an EV, finding a home cha...",2025-06-02 14:00:58,1,0,https://www.reddit.com/r/electricvehicles/comm...,General Questions and Purchasing Advice Thread...
3,electricvehicles,EV Batteries Got 20% Cheaper Last Year,,2025-06-02 13:55:19,97,13,https://insideevs.com/news/761338/ev-battery-c...,EV Batteries Got 20% Cheaper Last Year
4,electricvehicles,Thoughts on Automakers having control over you...,There has been a recent uproar regarding Zeekr...,2025-06-02 09:47:23,8,17,https://www.reddit.com/r/electricvehicles/comm...,Thoughts on Automakers having control over you...


In [73]:
# Remove capital letters, urls, lines, ampersands, emojis, symbols for text analysis
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  
    text = re.sub(r'\n', ' ', text) 
    text = re.sub(r'&amp;', '&', text) 
    text = re.sub(r'\[.*?\]\(.*?\)', '', text) 
    text = re.sub(r'>+', '', text)  
    text = re.sub(r'[^\x00-\x7F]+', '', text) 
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)
    text = re.sub(r'\s+', ' ', text) 
    return text.strip()

In [74]:
df["clean_text"] = df["full_text"].apply(clean_text)

In [75]:
df.head()

Unnamed: 0,subreddit,title,selftext,created_utc,upvotes,num_comments,url,full_text,clean_text
0,electricvehicles,Audi Went To China To Build Cars. China Rebuil...,,2025-06-02 14:34:41,5,0,https://insideevs.com/news/761013/china-audi-r...,Audi Went To China To Build Cars. China Rebuil...,audi went to china to build cars china rebuilt...
1,electricvehicles,What can we actually do with EV batteries once...,"Hey everyone,\n\nI’m exploring the idea of reu...",2025-06-02 12:41:52,2,9,https://www.reddit.com/r/electricvehicles/comm...,What can we actually do with EV batteries once...,what can we actually do with ev batteries once...
2,electricvehicles,General Questions and Purchasing Advice Thread...,"**Need help choosing an EV, finding a home cha...",2025-06-02 14:00:58,1,0,https://www.reddit.com/r/electricvehicles/comm...,General Questions and Purchasing Advice Thread...,general questions and purchasing advice thread...
3,electricvehicles,EV Batteries Got 20% Cheaper Last Year,,2025-06-02 13:55:19,97,13,https://insideevs.com/news/761338/ev-battery-c...,EV Batteries Got 20% Cheaper Last Year,ev batteries got 20 cheaper last year
4,electricvehicles,Thoughts on Automakers having control over you...,There has been a recent uproar regarding Zeekr...,2025-06-02 09:47:23,8,17,https://www.reddit.com/r/electricvehicles/comm...,Thoughts on Automakers having control over you...,thoughts on automakers having control over you...


In [76]:
# Drop any rows with shorter text 
df = df[df["clean_text"].str.len() > 20]

In [77]:
df.to_csv("reddit_ev_posts_cleaned_df.csv", index=False)
print("Cleaned data")

Cleaned data


## Brand Mapping (Enrichment)
Tagging each post with a likely brand, based on keyword matching.

In [82]:
brand_keywords = {
    "mercedes": ["mercedes", "benz", "eqs", "eqe", "eqb"],
    "tesla": ["tesla", "model s", "model 3", "model x", "model y", "cybertruck"],
    "bmw": ["bmw", "ix", "i4", "i3"],
    "volkswagen": ["volkswagen", "vw", "id4", "id.4", "id3", "id.3"],
    "byd": ["byd", "atto", "seal", "dolphin"],
    "generic_ev": ["ev", "electric vehicle", "electric car"]
}

In [83]:
def tag_brand(text):
    for brand, keywords in brand_keywords.items():
        for kw in keywords:
            if kw in text:
                return brand
    return "unknown"

In [84]:
df["brand"] = df["clean_text"].apply(tag_brand)

In [85]:
df.to_csv("reddit_ev_posts_cleaned_df.csv", index=False)
print("Brand tag data saved")

Brand tag data saved
