# Assignment 2: Text Classification

## Data Preprocessing

In [7]:
import json
import csv
import datetime

# Convert the line-delimited JSON dataset into a flat CSV file.
#
# Each non-blank line of the input is parsed as one JSON object. The text
# fields are copied through unchanged and the `date` field is rendered as a
# 'YYYY-MM-DD HH:MM:SS' string in UTC.
processed_data = []

# Fields copied verbatim from each JSON record into the output row
text_fields = ("link", "headline", "category", "short_description", "authors")

# Read with an explicit encoding so decoding does not depend on the platform's
# default locale encoding (the CSV below is written as UTF-8 as well).
with open('News_Category_Dataset_IS_course.json', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (e.g. a trailing newline at end of file);
        # json.loads would raise on them.
        if not line.strip():
            continue

        # Parse the JSON data for this line
        entry = json.loads(line)

        # The date value is divided by 1000 before conversion, i.e. it is
        # treated as milliseconds since the epoch. A timezone-aware conversion
        # replaces the deprecated datetime.utcfromtimestamp and yields the same
        # formatted string.
        formatted_date = datetime.datetime.fromtimestamp(
            entry["date"] / 1000.0, tz=datetime.timezone.utc
        ).strftime('%Y-%m-%d %H:%M:%S')

        # Store the processed entry in the list
        processed_entry = {field: entry[field] for field in text_fields}
        processed_entry["date"] = formatted_date
        processed_data.append(processed_entry)

# Write processed data to a CSV file
csv_file_path = 'processed_data.csv'
fieldnames = ["link", "headline", "category", "short_description", "authors", "date"]

# newline='' lets the csv module control line endings itself
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Header row first, then every processed record in input order
    writer.writeheader()
    writer.writerows(processed_data)

In [12]:
import pandas as pd

# Read CSV file into a Pandas DataFrame
# NOTE(review): blank CSV fields presumably come back as missing values here,
# which would explain the null counts reported later — confirm.
df = pd.read_csv('processed_data.csv')
# Bare last expression so the notebook renders the frame
df

Unnamed: 0,link,headline,category,short_description,authors,date
0,https://www.huffpost.com/entry/funniest-tweets...,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha...",Elyse Wanshel,2022-09-23 00:00:00
1,https://www.huffpost.com/entry/funniest-parent...,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to...",Caroline Bologna,2022-09-23 00:00:00
2,https://www.huffpost.com/entry/dodgers-basebal...,"Maury Wills, Base-Stealing Shortstop For Dodge...",SPORTS,"Maury Wills, who helped the Los Angeles Dodger...","Beth Harris, AP",2022-09-20 00:00:00
3,https://www.huffpost.com/entry/golden-globes-r...,Golden Globes Returning To NBC In January Afte...,ENTERTAINMENT,"For the past 18 months, Hollywood has effectiv...",,2022-09-20 00:00:00
4,https://www.huffpost.com/entry/biden-us-forces...,Biden Says U.S. Forces Would Defend Taiwan If ...,POLITICS,President issues vow as tensions with China rise.,,2022-09-19 00:00:00
...,...,...,...,...,...,...
148117,https://www.huffingtonpost.com/entry/girl-with...,'Girl With the Dragon Tattoo' India Release Ca...,ENTERTAINMENT,"""Sony Pictures will not be releasing The Girl ...",,2012-01-28 00:00:00
148118,https://www.huffingtonpost.com/entry/maria-sha...,Maria Sharapova Stunned By Victoria Azarenka I...,SPORTS,"Afterward, Azarenka, more effusive with the pr...",,2012-01-28 00:00:00
148119,https://www.huffingtonpost.com/entry/super-bow...,"Giants Over Patriots, Jets Over Colts Among M...",SPORTS,"Leading up to Super Bowl XLVI, the most talked...",,2012-01-28 00:00:00
148120,https://www.huffingtonpost.com/entry/aldon-smi...,Aldon Smith Arrested: 49ers Linebacker Busted ...,SPORTS,CORRECTION: An earlier version of this story i...,,2012-01-28 00:00:00


In [13]:
# Per-column summary of df (the rendered output lists count, unique, top and freq)
df.describe()

Unnamed: 0,link,headline,category,short_description,authors,date
count,148122,147388,148122,135938,123706,148122
unique,148096,146295,15,133792,19633,3618
top,https://www.huffingtonpost.comhttp://www.newre...,Sunday Roundup,POLITICS,Welcome to the HuffPost Rise Morning Newsbrief...,Lee Moran,2014-11-05 00:00:00
freq,2,90,35602,191,2058,98


In [11]:
# Tally the null entries found in every column of the frame
missing_data = df.isna().sum()

# Emit the heading and the per-column counts, one after the other
print("Missing Data Summary:", missing_data, sep="\n")

Missing Data Summary:
link                     0
headline               734
category                 0
short_description    12184
authors              24416
date                     0
dtype: int64


We do not drop rows with a missing `short_description` or `authors` value, because they make up a large share of the dataset (12,184 and 24,416 of 148,122 rows, respectively).

In [22]:
# Check the data types of the columns
df.dtypes

link                 string[python]
headline             string[python]
category             string[python]
short_description    string[python]
authors              string[python]
date                         object
dtype: object

In [24]:
# Cast every column except `date` to the pandas "string" dtype
for text_column in ('link', 'headline', 'category', 'short_description', 'authors'):
    df[text_column] = df[text_column].astype("string")

# TODO: some links are messy — several URLs appear glued together into one
# value (see the most frequent `link` reported by df.describe()); not sure yet
# how to handle this. Next up: tokenization...