### Importing the data from GitHub into a DataFrame

In [2]:
import os
import pandas as pd

# Function to list all text files in a directory
def list_text_files(directory):
    """Collect the paths of all ``part-*`` files beneath ``directory``.

    The tree is traversed with ``os.walk``, so files in nested folders are
    included as well.

    Parameters
    ----------
    directory : str
        Folder at which the traversal starts.

    Returns
    -------
    list of str
        One path per file whose name starts with ``"part-"``, each built by
        joining the folder in which it was found with the file name. The list
        is empty when nothing matches.
    """
    return [
        os.path.join(folder, filename)
        for folder, _subfolders, filenames in os.walk(directory)
        for filename in filenames
        if filename.startswith("part-")
    ]

# Function to read text data and store in DataFrame
def create_dataframe_from_text_files(text_files):
    """Read every file in full and return one DataFrame row per file.

    Parameters
    ----------
    text_files : iterable of str
        Paths of the files to read. Each file is opened in text mode and
        decoded as UTF-8.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'File'`` (the path exactly as given) and ``'Text'``
        (the complete file content as one string), with rows in input order.
        An empty input yields an empty frame that still has both columns.
    """
    records = []
    for path in text_files:
        with open(path, 'r', encoding='utf-8') as handle:  # Adjust encoding if needed
            records.append({'File': path, 'Text': handle.read()})
    # Naming the columns explicitly keeps 'File' and 'Text' present even when
    # no files were found, so later column lookups do not raise KeyError.
    return pd.DataFrame(records, columns=['File', 'Text'])

# Root folder of the dataset; its immediate subfolders hold the "part-*" files.
# Alternative location on another machine, kept for reference:
#parent_directory = 'C:/Users/bryce/Documents/@ Education/KUL/Year 2 Semester 2/Advanced analytics for business/Advanced_Analytics_2024/Assignment_3/datasets'
parent_directory = 'C:/Users/lenne/anaconda3/envs/AA/Advanced_Analytics/Assignment_3/spark/datasets'

# Keep only those entries of the root folder that are themselves folders.
all_subdirectories = list(
    filter(
        os.path.isdir,
        (os.path.join(parent_directory, entry) for entry in os.listdir(parent_directory)),
    )
)

# Gather the part files of every subfolder, in subfolder order.
all_text_files = []
for subdirectory in all_subdirectories:
    text_files = list_text_files(subdirectory)
    all_text_files += text_files

# One row per file: its path and its full text content.
text_df = create_dataframe_from_text_files(all_text_files)

# Preview the first rows.
print(text_df.head())


                                                File  \
0  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
1  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
2  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
3  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
4  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   

                                                Text  
0  {"aid": "39958086", "title": "Large Hadron Col...  
1  {"aid": "39958094", "title": "An editor for ma...  
2  {"aid": "39958109", "title": "You shouldn't ho...  
3  {"aid": "39958127", "title": "Isaac Asimov obi...  
4  {"aid": "39958129", "title": "Do people genera...  


In [3]:
# Print the frame's text summary; pandas abbreviates the middle rows with "...".
print(text_df)

                                                   File  \
0     C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
1     C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
2     C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
3     C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
4     C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
...                                                 ...   
5742  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
5743  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
5744  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
5745  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   
5746  C:/Users/lenne/anaconda3/envs/AA/Advanced_Anal...   

                                                   Text  
0     {"aid": "39958086", "title": "Large Hadron Col...  
1     {"aid": "39958094", "title": "An editor for ma...  
2     {"aid": "39958109", "title": "You shouldn't ho...  
3     {"aid": "39958127", "title": "Isaac Asimov obi...  
4

In [4]:
# Select the "Text" column on its own. Despite the "_df" suffix this is a
# pandas Series: the printed summary ends with "Name: Text, ... dtype: object".
text_only_df = text_df["Text"]
print(text_only_df)

0       {"aid": "39958086", "title": "Large Hadron Col...
1       {"aid": "39958094", "title": "An editor for ma...
2       {"aid": "39958109", "title": "You shouldn't ho...
3       {"aid": "39958127", "title": "Isaac Asimov obi...
4       {"aid": "39958129", "title": "Do people genera...
                              ...                        
5742    {"aid": "40105454", "title": "The Difference B...
5743    {"aid": "40105465", "title": "Where the Bitter...
5744    {"aid": "40105482", "title": "Makefile-graph: ...
5745    {"aid": "40105498", "title": "Online dating sp...
5746    {"aid": "40105510", "title": "Everything I Kno...
Name: Text, Length: 5747, dtype: object


In [5]:
import json

def parse_json(text):
    """Decode one JSON document and wrap the result in a pandas Series.

    Parameters
    ----------
    text : str
        A JSON document, decoded with ``json.loads``.

    Returns
    -------
    pandas.Series
        Built directly from the decoded value; for a JSON object the keys
        become the index labels and the values become the entries.
    """
    return pd.Series(json.loads(text))



In [6]:
# Run parse_json on every entry of text_only_df. Because parse_json returns a
# Series, the per-row results are assembled into a single DataFrame whose
# columns are the JSON keys (aid, title, url, ...).
result_df = text_only_df.apply(parse_json)

# Display the result
print(result_df)

           aid                                              title  \
0     39958086  Large Hadron Collider reaches its first stable...   
1     39958094  An editor for making wireframes with a pastebi...   
2     39958109           You shouldn't host your own email server   
3     39958127        Isaac Asimov obituary – Brian Aldiss (1992)   
4     39958129  Do people generally agree with Shaoshan Liu an...   
...        ...                                                ...   
5742  40105454  The Difference Between Startup Valuation and R...   
5743  40105465                       Where the Bitter Lesson Ends   
5744  40105482  Makefile-graph: Parse Make's internal database...   
5745  40105498  Online dating spells the end of Britain's lone...   
5746  40105510  Everything I Know About Creating Buzz, I Learn...   

                                                    url  \
0     https://home.cern/news/news/accelerators/large...   
1                                 https://www.webma.s

In [7]:
# List the column names that the JSON parsing step produced.
print(result_df.columns)

Index(['aid', 'title', 'url', 'domain', 'votes', 'user', 'posted_at',
       'comments', 'source_title', 'source_text', 'frontpage'],
      dtype='object')


In [12]:
# Export result_df to a CSV file
#result_df.to_csv('parsed_data.csv', index=False)


#### Saving the data

In [15]:
# Into CSV file
#result_df.to_csv('data_full.csv', index=False)


In [8]:
# Into JSON file
# orient='records' serialises the frame as a JSON array with one object per row;
# the relative path means the file is written to the current working directory.
result_df.to_json('data_full.json', orient='records')

### Analysis

In [17]:
# The analysis works from the CSV export. The cells that would write that file
# are commented out, so on a clean checkout it may not exist; in that case
# write it from result_df first so that "Restart & Run All" does not stop with
# FileNotFoundError. When the file is already present it is read unchanged.
csv_path = 'data_full.csv'
if not os.path.exists(csv_path):
    result_df.to_csv(csv_path, index=False)
data = pd.read_csv(csv_path)
data.shape

(5747, 11)