In [1]:
from datetime import datetime
from pathlib import Path

import pandas as pd

In [2]:
# Directory the raw CSV files are read from.
raw_data = Path.cwd() / "raw"
# Directory the combined CSV is written to.
data = Path.cwd() / "data"

# exist_ok=True replaces the separate exists() check: it avoids the
# check-then-create race and makes re-running this cell a no-op.
data.mkdir(exist_ok=True)

In [3]:
def sanitize_month(year_month: str) -> int:
    """Return the month number (1-12) encoded in an underscore-separated label.

    Labels that start with ``"AF"`` carry the abbreviated month name in their
    second underscore-separated field; all other labels carry it in the first.

    Args:
        year_month: Label such as ``"Apr_2018"`` or ``"AF_Apr_2018"``.

    Returns:
        The month as an integer, parsed from the abbreviated month name.

    Raises:
        ValueError: If the selected field is not a month abbreviation
            recognised by ``strptime``'s ``%b`` directive.
        IndexError: If an ``"AF"``-prefixed label has no second field.
    """
    fields = year_month.split("_")
    month_abbr = fields[1] if year_month.startswith("AF") else fields[0]
    return datetime.strptime(month_abbr, "%b").month

def transform_raw_data(path: "str | Path") -> pd.DataFrame:
    """Load one raw CSV file and tag its rows with month, year and branch.

    The file is read with explicit column names (``month_year``, ``polarity``,
    ``score``), so its first line is treated as data, not as a header. The
    year and branch are taken from the first and second underscore-separated
    parts of the file name's stem (e.g. ``2018_AirForce.csv``).

    Args:
        path: Location of the CSV file. Accepts a string or a ``Path``; the
            value is coerced to ``Path`` because the stem is needed.

    Returns:
        A DataFrame with columns ``polarity``, ``score``, ``month``, ``year``
        and ``branch``. ``year`` is kept as the string found in the file name.

    Raises:
        IndexError: If the file stem has no underscore-separated second part.
    """
    # The original annotation promised ``str`` but used ``path.stem``, which
    # only exists on Path objects; coercing makes both input kinds work.
    path = Path(path)
    stem_parts = path.stem.split("_")

    df = pd.read_csv(path, names=["month_year", "polarity", "score"])
    df["month"] = df["month_year"].apply(sanitize_month)
    df["year"] = stem_parts[0]
    df["branch"] = stem_parts[1]
    # The raw label is no longer needed once the month has been extracted.
    return df.drop(columns="month_year")

In [4]:
# Path.glob yields matches in no guaranteed order; sorting keeps the row order
# of the combined frame stable across runs and machines.
csv_paths = sorted(raw_data.glob("*.csv"))
if not csv_paths:
    # Without this guard pd.concat fails with a generic "No objects to
    # concatenate" error that does not say which directory was empty.
    raise FileNotFoundError(f"No CSV files found in {raw_data}")

dataframes = [transform_raw_data(path) for path in csv_paths]

# ignore_index=True gives the combined frame a fresh 0..n-1 index, replacing
# the per-file indexes that would otherwise repeat.
df = pd.concat(dataframes, ignore_index=True)
df

Unnamed: 0,polarity,score,month,year,branch
0,NEU,0.977779,4,2018,AirForce
1,NEU,0.791076,4,2018,AirForce
2,POS,0.898228,4,2018,AirForce
3,NEU,0.982220,4,2018,AirForce
4,POS,0.916744,4,2018,AirForce
...,...,...,...,...,...
162619,NEG,0.728456,10,2022,Army
162620,NEG,0.846738,10,2022,Army
162621,NEU,0.855076,10,2022,Army
162622,NEG,0.931037,10,2022,Army


In [5]:
# Persist the combined frame. The row index is written as well (pandas'
# default), so the file gets an extra, unnamed first column.
output_csv = data / "sentiment_data.csv"
df.to_csv(output_csv)