### Importing the data from github into a dataframe

In [3]:
import os
import pandas as pd

# Function to list all text files in a directory
def list_text_files(directory, prefix="part-"):
    """Recursively collect the paths of data files found below ``directory``.

    Despite the function name, files are selected by file-name prefix, not
    by extension.

    Args:
        directory: Root folder to walk. A folder that does not exist yields
            an empty list, because ``os.walk`` ignores listing errors by
            default.
        prefix: Only files whose name starts with this string are kept.
            Defaults to ``"part-"``, the value that used to be hard-coded.

    Returns:
        A list of paths (``root`` joined with the file name) in the order
        ``os.walk`` produces them.
    """
    return [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.startswith(prefix)
    ]

# Function to read text data and store in DataFrame
def create_dataframe_from_text_files(text_files):
    """Read every file in ``text_files`` into one row of a DataFrame.

    Args:
        text_files: Iterable of file paths. Each file is read in full and
            decoded as UTF-8 (adjust the encoding if the data requires it).

    Returns:
        A DataFrame with the columns ``'File'`` (the path as given) and
        ``'Text'`` (the complete file contents), one row per path, in input
        order. Both columns exist even when ``text_files`` is empty, so
        later selections such as ``df['Text']`` do not raise ``KeyError``.

    Raises:
        OSError, UnicodeDecodeError: propagated unchanged from opening or
            reading a file.
    """
    data = []
    for file in text_files:
        with open(file, 'r', encoding='utf-8') as f:
            data.append({'File': file, 'Text': f.read()})
    # Naming the columns explicitly keeps the schema stable for an empty list.
    return pd.DataFrame(data, columns=['File', 'Text'])

# Parent directory containing all subdirectories with text files.
# Set the ASSIGNMENT3_DATASETS_DIR environment variable to read the data from
# another location; without it the original local path is used, so existing
# runs behave exactly as before.
parent_directory = os.environ.get(
    'ASSIGNMENT3_DATASETS_DIR',
    'C:/Users/bryce/Documents/@ Education/KUL/Year 2 Semester 2/Advanced analytics for business/Advanced_Analytics_2024/Assignment_3/datasets',
)

# List all subdirectories within the parent directory
# (files that sit directly in the parent directory are not scanned)
all_subdirectories = [
    os.path.join(parent_directory, name)
    for name in os.listdir(parent_directory)
    if os.path.isdir(os.path.join(parent_directory, name))
]

# List all text files in all subdirectories
all_text_files = []
for subdirectory in all_subdirectories:
    text_files = list_text_files(subdirectory)
    all_text_files.extend(text_files)

# Create DataFrame from text files: one row per file, columns 'File' and 'Text'
text_df = create_dataframe_from_text_files(all_text_files)

# Display DataFrame
print(text_df.head())


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


                                                File  \
0  C:/Users/bryce/Documents/@ Education/KUL/Year ...   
1  C:/Users/bryce/Documents/@ Education/KUL/Year ...   
2  C:/Users/bryce/Documents/@ Education/KUL/Year ...   
3  C:/Users/bryce/Documents/@ Education/KUL/Year ...   
4  C:/Users/bryce/Documents/@ Education/KUL/Year ...   

                                                Text  
0  {"aid": "39958086", "title": "Large Hadron Col...  
1  {"aid": "39958094", "title": "An editor for ma...  
2  {"aid": "39958109", "title": "You shouldn't ho...  
3  {"aid": "39958127", "title": "Isaac Asimov obi...  
4  {"aid": "39958129", "title": "Do people genera...  


In [4]:
# Print the whole frame; pandas elides the middle rows of a long frame
print(text_df)

                                                   File  \
0     C:/Users/bryce/Documents/@ Education/KUL/Year ...   
1     C:/Users/bryce/Documents/@ Education/KUL/Year ...   
2     C:/Users/bryce/Documents/@ Education/KUL/Year ...   
3     C:/Users/bryce/Documents/@ Education/KUL/Year ...   
4     C:/Users/bryce/Documents/@ Education/KUL/Year ...   
...                                                 ...   
5742  C:/Users/bryce/Documents/@ Education/KUL/Year ...   
5743  C:/Users/bryce/Documents/@ Education/KUL/Year ...   
5744  C:/Users/bryce/Documents/@ Education/KUL/Year ...   
5745  C:/Users/bryce/Documents/@ Education/KUL/Year ...   
5746  C:/Users/bryce/Documents/@ Education/KUL/Year ...   

                                                   Text  
0     {"aid": "39958086", "title": "Large Hadron Col...  
1     {"aid": "39958094", "title": "An editor for ma...  
2     {"aid": "39958109", "title": "You shouldn't ho...  
3     {"aid": "39958127", "title": "Isaac Asimov obi...  
4

In [5]:
# Keep only the raw JSON strings. Single-bracket selection returns a pandas
# Series (not a DataFrame, despite the variable name).
text_only_df = text_df["Text"]
print(text_only_df)

0       {"aid": "39958086", "title": "Large Hadron Col...
1       {"aid": "39958094", "title": "An editor for ma...
2       {"aid": "39958109", "title": "You shouldn't ho...
3       {"aid": "39958127", "title": "Isaac Asimov obi...
4       {"aid": "39958129", "title": "Do people genera...
                              ...                        
5742    {"aid": "40105454", "title": "The Difference B...
5743    {"aid": "40105465", "title": "Where the Bitter...
5744    {"aid": "40105482", "title": "Makefile-graph: ...
5745    {"aid": "40105498", "title": "Online dating sp...
5746    {"aid": "40105510", "title": "Everything I Kno...
Name: Text, Length: 5747, dtype: object


In [6]:
import json

def parse_json(text):
    """Decode one JSON document and return its top-level fields as a Series.

    The JSON keys become the Series index. Decoding errors raised by
    ``json.loads`` are not caught and propagate to the caller.
    """
    return pd.Series(json.loads(text))



In [7]:
# Parse every JSON string with parse_json. Because the function returns a
# Series, Series.apply assembles the results into a DataFrame with one
# column per JSON key.
result_df = text_only_df.apply(parse_json)

# Display the result
print(result_df)

           aid                                              title  \
0     39958086  Large Hadron Collider reaches its first stable...   
1     39958094  An editor for making wireframes with a pastebi...   
2     39958109           You shouldn't host your own email server   
3     39958127        Isaac Asimov obituary – Brian Aldiss (1992)   
4     39958129  Do people generally agree with Shaoshan Liu an...   
...        ...                                                ...   
5742  40105454  The Difference Between Startup Valuation and R...   
5743  40105465                       Where the Bitter Lesson Ends   
5744  40105482  Makefile-graph: Parse Make's internal database...   
5745  40105498  Online dating spells the end of Britain's lone...   
5746  40105510  Everything I Know About Creating Buzz, I Learn...   

                                                    url  \
0     https://home.cern/news/news/accelerators/large...   
1                                 https://www.webma.s

In [8]:
# List the columns produced from the JSON keys
print(result_df.columns)

Index(['aid', 'title', 'url', 'domain', 'votes', 'user', 'posted_at',
       'comments', 'source_title', 'source_text', 'frontpage'],
      dtype='object')


In [9]:
# Export result_df to a CSV file (row index omitted).
# NOTE(review): the next cell writes the same frame to 'data_full.csv', which
# is the file read back in the Analysis section; this 'parsed_data.csv' copy
# looks redundant — confirm nothing else uses it before removing.
result_df.to_csv('parsed_data.csv', index=False)


#### Saving the data

In [10]:
# Save the parsed frame as CSV (row index omitted); this file is reloaded in
# the Analysis section
result_df.to_csv('data_full.csv', index=False)


In [11]:
# Save the parsed frame as JSON; orient='records' writes a list with one
# object per row
result_df.to_json('data_full.json', orient='records')

### Analysis

In [12]:
# Reload the saved CSV; read_csv infers each column's dtype from the text
data = pd.read_csv('data_full.csv')
# (rows, columns) of the reloaded frame
data.shape

(5747, 11)

In [13]:
# Print the reloaded frame; pandas elides the middle rows of a long frame
print(data)

           aid                                              title  \
0     39958086  Large Hadron Collider reaches its first stable...   
1     39958094  An editor for making wireframes with a pastebi...   
2     39958109           You shouldn't host your own email server   
3     39958127        Isaac Asimov obituary – Brian Aldiss (1992)   
4     39958129  Do people generally agree with Shaoshan Liu an...   
...        ...                                                ...   
5742  40105454  The Difference Between Startup Valuation and R...   
5743  40105465                       Where the Bitter Lesson Ends   
5744  40105482  Makefile-graph: Parse Make's internal database...   
5745  40105498  Online dating spells the end of Britain's lone...   
5746  40105510  Everything I Know About Creating Buzz, I Learn...   

                                                    url  \
0     https://home.cern/news/news/accelerators/large...   
1                                 https://www.webma.s

#### Frontpage

In [14]:
# Display the first five values of the "frontpage" variable to see its raw
# form before it is encoded as integers
print(data['frontpage'].head())


0    False
1    False
2    False
3    False
4    False
Name: frontpage, dtype: bool


In [15]:
from sklearn.model_selection import train_test_split

# Encode the boolean 'frontpage' flag as integers (the column is overwritten in place)
data['frontpage'] = data['frontpage'].astype(int)  # True -> 1, False -> 0

In [16]:
# Check that the column now holds integers instead of booleans
print(data['frontpage'].head())

0    0
1    0
2    0
3    0
4    0
Name: frontpage, dtype: int32


In [17]:
# Class balance of the target: number of rows for each 'frontpage' value
frontpage_counts = data['frontpage'].value_counts()

# Emit the label and the counts on separate lines in a single print call
print("Counts of False and True in 'frontpage' variable:", frontpage_counts, sep="\n")


Counts of False and True in 'frontpage' variable:
frontpage
0    4764
1     983
Name: count, dtype: int64


#### Comments

In [18]:
# Print the "comments" variable (pandas elides the middle of a long Series)
print(data['comments'])

0       0
1       0
2       0
3       0
4       0
       ..
5742    0
5743    0
5744    0
5745    0
5746    0
Name: comments, Length: 5747, dtype: int64


In [19]:
# Calculate the number of occurrences of each distinct value in the 'comments' variable
comments_counts = data['comments'].value_counts()

# Print the counts (value_counts lists the most frequent value first)
print("Counts of comments:")
print(comments_counts)


Counts of comments:
comments
0      5219
2       203
3        67
4        42
5        26
6        23
8        19
7        17
9        17
11       13
12       11
13        8
16        7
14        6
17        6
15        5
19        5
18        5
10        4
28        3
35        3
22        3
20        3
21        2
88        2
26        2
24        2
23        1
76        1
105       1
33        1
57        1
83        1
63        1
142       1
67        1
46        1
40        1
38        1
82        1
45        1
36        1
25        1
51        1
89        1
49        1
34        1
44        1
29        1
92        1
48        1
Name: count, dtype: int64


In [20]:
# Distinct values of 'comments', in order of first appearance
unique_comments = data['comments'].unique()

# Print the unique values
print("Unique values of 'comments':", unique_comments)

Unique values of 'comments': [  0  11   2   3   4   7  17   9   8   6  10  14  19   5  15  12  33  40
  16  38  18  89  92  23  82  20  29  44  34  49  22  51  26  25  36  45
  24  35  21  13  46  67 142  63  83  57  28  88 105  76  48]


#### Posted_at

In [21]:
# Print the "posted_at" variable (pandas elides the middle of a long Series);
# it is not yet converted to datetime at this point
print(data['posted_at'])

0       2024-04-07 03:57:32
1       2024-04-07 03:59:55
2       2024-04-07 04:02:29
3       2024-04-07 04:07:15
4       2024-04-07 04:07:32
               ...         
5742    2024-04-21 13:04:36
5743    2024-04-21 13:05:50
5744    2024-04-21 13:07:23
5745    2024-04-21 13:08:44
5746    2024-04-21 13:10:13
Name: posted_at, Length: 5747, dtype: object


I would like to convert "posted_at" into two separate variables. Day: a numeric variable counting the days elapsed since the earliest timestamp in the data. Time: the time elapsed since the first post of the same day.

In [22]:
# Convert 'posted_at' to datetime format (the column is overwritten in place)
data['posted_at'] = pd.to_datetime(data['posted_at'])

# Extract 'Day' and 'Time' components: .dt.date gives the calendar date and
# .dt.time the clock time of each timestamp.
# NOTE(review): the markdown above asks for a numeric day count from the
# earliest timestamp and a time difference from each day's first post. Neither
# is computed here — these columns hold only the raw date and time parts.
# Confirm whether the numeric conversion happens later or is still missing.
data['Day'] = data['posted_at'].dt.date
data['Time'] = data['posted_at'].dt.time

# Print the DataFrame to check the changes
print(data[['posted_at', 'Day', 'Time']].head())


            posted_at         Day      Time
0 2024-04-07 03:57:32  2024-04-07  03:57:32
1 2024-04-07 03:59:55  2024-04-07  03:59:55
2 2024-04-07 04:02:29  2024-04-07  04:02:29
3 2024-04-07 04:07:15  2024-04-07  04:07:15
4 2024-04-07 04:07:32  2024-04-07  04:07:32


In [23]:
# List the columns to confirm that 'Day' and 'Time' were added
print(data.columns)

Index(['aid', 'title', 'url', 'domain', 'votes', 'user', 'posted_at',
       'comments', 'source_title', 'source_text', 'frontpage', 'Day', 'Time'],
      dtype='object')


### User

In [24]:
# Print the "user" variable (pandas elides the middle of a long Series)
print(data['user'])

0             Jimmc414
1                  tdk
2             zgin4679
3          thunderbong
4            omnifidus
             ...      
5742          jreacher
5743           oli5679
5744            donatj
5745    helsinkiandrew
5746      priyankanath
Name: user, Length: 5747, dtype: object


In [25]:
# Distinct user names, in order of first appearance
unique_users = data['user'].unique()

# Print the unique values
print("Unique values of 'user':", unique_users)

# Calculate the number of unique users (length of the array of distinct values)
num_unique_users = len(unique_users)
print("Number of unique users:", num_unique_users)

Unique values of 'user': ['Jimmc414' 'tdk' 'zgin4679' ... 'jreacher' 'oli5679' 'donatj']
Number of unique users: 2630


### Votes

In [26]:
# Print the "votes" variable (pandas elides the middle of a long Series)
print(data['votes'])

0       1
1       1
2       1
3       1
4       1
       ..
5742    1
5743    1
5744    1
5745    1
5746    1
Name: votes, Length: 5747, dtype: int64


In [27]:
# Distinct values of 'votes', in order of first appearance
unique_votes = data['votes'].unique()

# Print the unique values
print("Unique values of 'votes':", unique_votes)

Unique values of 'votes': [  1   7   2  11   8  33  15   6   3   4   5  12  17  13  10  41   9  14
  31  53  27  18  25  22  24  20  38  58  28  30  16  45  39  54  88  70
  40  84  35  29  23  21  56  19 130 103  42  34  69  43  63  44  26  82
 137  47  59  50  85 437  92  36  49  57  32  73  76  48  37 106  65  90]


### Domain

In [28]:
# Print the "domain" variable (pandas elides the middle of a long Series)
print(data['domain'])

0                   home.cern
1                    webma.sh
2                  reddit.com
3             theguardian.com
4                     acm.org
                ...          
5742           crunchbase.com
5743         geohot.github.io
5744        github.com/dnaeon
5745            economist.com
5746    mbrandolph.medium.com
Name: domain, Length: 5747, dtype: object


In [29]:
# Distinct values of 'domain', in order of first appearance
unique_domain = data['domain'].unique()

# Calculate the number of unique domains
num_unique_domain = len(unique_domain)
print("Number of unique domains:", num_unique_domain)

Number of unique domains: 2936


### Missing data

In [30]:
# Number of null entries in each column
missing_count = data.isnull().sum()

# Tabulate the counts, largest first, keeping only the columns that have at
# least one missing value
missing_data = (
    pd.DataFrame({'Column Name': missing_count.index, 'Missing Count': missing_count.values})
    .sort_values(by='Missing Count', ascending=False)
    .loc[lambda counts: counts['Missing Count'] > 0]
)
missing_data

Unnamed: 0,Column Name,Missing Count
8,source_title,116
9,source_text,5
