In [1]:
import glob
import os

import pandas as pd


## 1. Read files from directory

- Remove rows with duplicated `ContextualText` values
- Store each file's DataFrame in a dict keyed by an integer index


In [8]:
# Load every CSV export in the input directory into its own DataFrame.
# sorted(): glob returns paths in arbitrary, filesystem-dependent order; a fixed
# order keeps the idx -> file mapping (and the seeded sampling further down,
# which depends on row order) reproducible across runs and machines.
files = sorted(glob.glob("us_2024_news.csv/*.csv"))
data_columns = ["DateTime","URL","Title","SharingImage","LangCode","DocTone","DomainCountryCode","Location","Lat","Lon","CountryCode","Adm1Code","Adm2Code","GeoType","ContextualText","the_geom","date"]

file_df_dict = {}  # key: idx, value: de-duplicated DataFrame for that file
file_register = {}  # key: idx, value: [file name up to its first ".", file path]

for idx, fileDir in enumerate(files):
    # os.path.basename splits on the platform's path separator; splitting on "/"
    # returned the whole path on Windows, where glob yields backslashes, so every
    # file ended up registered under the directory's name.
    fileName = os.path.basename(fileDir).split(".")[0]
    file_register[idx] = [fileName, fileDir]

    # header=None: the first line is read as data; column names are assigned below.
    # on_bad_lines='skip' silently drops rows pandas cannot parse.
    df = pd.read_csv(fileDir, header=None, low_memory=False, on_bad_lines='skip')
    df.columns = data_columns  # raises ValueError if the column count differs
    # Keep only the first row for each distinct ContextualText value.
    df = df.drop_duplicates("ContextualText")

    file_df_dict[idx] = df


## 2. Combine all news data into one DataFrame

In [9]:
# Stack the per-file frames into a single DataFrame with one concat call.
# Concatenating inside the loop re-copied all accumulated rows on every iteration
# (quadratic in the total row count), and seeding the result with an empty
# object-dtype frame can degrade column dtypes / emit a pandas FutureWarning.
file_idx_list = list(file_register.keys())
frames_to_combine = [file_df_dict[fileIdx] for fileIdx in file_idx_list]

if frames_to_combine:
    # Each file keeps its own row index (no ignore_index), as before.
    combined_df = pd.concat(frames_to_combine, axis=0)
else:
    # pd.concat raises on an empty list; fall back to an empty frame with the schema.
    combined_df = pd.DataFrame(columns=data_columns)
    



In [14]:
# Parse the timestamp column once, then derive an hourly bucket key
# formatted as YYYYMMDDHH from the parsed values.
parsed_timestamps = pd.to_datetime(combined_df['DateTime'])
combined_df['DateTime'] = parsed_timestamps
combined_df['Date_hourly'] = parsed_timestamps.dt.strftime('%Y%m%d%H')



In [16]:
# Row count per hourly bucket (a Series indexed by Date_hourly).
hourly_groups = combined_df.groupby("Date_hourly")
hr_breakdown_df = hourly_groups.size()

## 3. Sample articles from each hour

In [23]:
# Draw up to SAMPLES_PER_HOUR rows from every hourly bucket.
# DataFrame.sample raises ValueError when n exceeds the group size (replace=False),
# so hours with fewer rows are kept whole instead of aborting the whole run.
# random_state is fixed so the draw is repeatable for identical input.
SAMPLES_PER_HOUR = 50
sampled_news_df = (
    combined_df
    .groupby('Date_hourly')
    .apply(lambda x: x.sample(n=min(len(x), SAMPLES_PER_HOUR), random_state=1))
    .reset_index(drop=True)  # discard the (Date_hourly, original index) MultiIndex
)


In [77]:
# Preview of the hourly sample.
# NOTE(review): the recorded output lists only DateTime and ContextualText, so it
# was presumably produced from a different version of this frame — confirm.
sampled_news_df

Unnamed: 0,DateTime,ContextualText
0,2024-01-01 00:30:00+08:00,will appeal soon to the u s supreme court wher...
1,2024-01-01 00:30:00+08:00,lake wjon news two men were rescued from ann l...
2,2024-01-01 00:30:00+08:00,coach from highland county earning honors jenn...
3,2024-01-01 00:00:00+08:00,after meeting until the board voted to pass th...
4,2024-01-01 00:45:00+08:00,hollywood actress isla fisher who is one of th...
...,...,...
196845,2024-10-18 07:45:00+08:00,students are doing on school issued devices an...
196846,2024-10-18 07:45:00+08:00,violence in our country in america trans and g...
196847,2024-10-18 07:45:00+08:00,argument that she was framed before the trial ...
196848,2024-10-18 07:45:00+08:00,writing postcards during texas state board of ...


In [29]:
# Write the sampled data into N_OUTPUT_FILES consecutive CSV chunks.
N_OUTPUT_FILES = 100
OUTPUT_DIR = 'us_2024_news_sampled'

# to_csv does not create missing directories; make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Rows per file (floor division); e.g. 100 rows per file for exactly 10000 rows.
rows_per_file = len(sampled_news_df) // N_OUTPUT_FILES

# Save each chunk to a separate file
for i in range(N_OUTPUT_FILES):
    start_index = i * rows_per_file
    # The last file runs to the end of the frame so the remainder is not lost.
    is_last_file = i == N_OUTPUT_FILES - 1
    end_index = len(sampled_news_df) if is_last_file else (i + 1) * rows_per_file
    chunk = sampled_news_df.iloc[start_index:end_index]
    # Keep only the raw input columns (drops derived ones such as Date_hourly).
    chunk = chunk[data_columns]
    # header=False: the chunks are written without a header row.
    chunk.to_csv(os.path.join(OUTPUT_DIR, f'sampled_file_{i + 1}.csv'), index=False, header=False)

print(f"DataFrame split into {N_OUTPUT_FILES} files successfully.")

DataFrame split into 100 files successfully.


## 4. Merge data sources with PageRank scores

In [39]:
# Extract the host part of each URL: the third "/"-separated piece of
# "scheme://host/path". The vectorized .str accessor yields NaN for missing or
# too-short URLs, where the previous per-row lambda raised (AttributeError on
# NaN, IndexError on strings with fewer than two slashes).
combined_df["website"] = combined_df["URL"].str.split("/").str[2]

In [56]:
# Load the per-domain PageRank table (first CSV column is used as the row index)
# and keep only the join key and the score.
page_rank_raw = pd.read_csv("page_rank_score/page_rank_domains.csv", index_col=0)
page_rank_columns = ["domain", "page_rank_decimal"]
page_rank_df = page_rank_raw[page_rank_columns]
page_rank_df

Unnamed: 0,domain,page_rank_decimal
0,krnb.com,5.21
1,dailytarheel.com,5.29
2,gaytimes.co.uk,5.26
3,lindenlink.com,3.42
4,megatucson.iheart.com,2.87
...,...,...
49,ls-eng.obozrevatel.com,2.71
50,eastendbeacon.com,4.79
51,997kmjj.com,2.77
52,kbtx.com,5.18


In [57]:
# Attach the PageRank score by matching each row's website to the table's domain.
# Inner join (the merge default): rows whose website has no score are dropped.
combined_df = pd.merge(
    combined_df,
    page_rank_df,
    how="inner",
    left_on="website",
    right_on="domain",
)

In [63]:
# Columns carried forward into the main dataset, in output order.
relevant_columns = [
    "DateTime",
    "URL",
    "Title",
    "DocTone",
    "Location",
    "ContextualText",
    "Date_hourly",
    "website",
    "page_rank_decimal",
]
relevant_df = combined_df[relevant_columns]

In [65]:
# Persist the trimmed dataset without the index column.
main_output_path = "main_us_data.csv"
relevant_df.to_csv(main_output_path, index=False)

In [68]:
# Keep only rows that have a ContextualText value, then overwrite the saved file.
has_context_text = relevant_df["ContextualText"].notna()
relevant_df = relevant_df[has_context_text]
relevant_df.to_csv("main_us_data.csv", index=False)

In [73]:
# Number of distinct websites remaining after the merge.
# nunique counts in one vectorized pass instead of materializing two Python lists
# and a set; dropna=False keeps a missing website counted as one distinct value.
combined_df["website"].nunique(dropna=False)

7139

In [76]:
# Inspect every row published by one specific website.
target_website = 'www.beckershospitalreview.com'
combined_df[combined_df["website"] == target_website]

Unnamed: 0,DateTime,URL,Title,SharingImage,LangCode,DocTone,DomainCountryCode,Location,Lat,Lon,...,Adm1Code,Adm2Code,GeoType,ContextualText,the_geom,date,Date_hourly,website,domain,page_rank_decimal
