In [1]:
# imports
import json
import gzip
import numpy as np
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import Ridge

In [2]:
def read_json(data_path: str) -> list:
    with gzip.open(data_path, 'rt', encoding='utf-8') as f:
        return json.load(f)

In [3]:
def open_json(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        try:
            data = json.load(file)
            print("JSON is valid.")
        except json.JSONDecodeError as e:
            print(f"Invalid JSON: {e}")
    return data

In [4]:
data = open_json("data/rtvslo_train.json")

JSON is valid.


In [5]:
data[0]

{'url': 'https://www.rtvslo.si/sport/kosarka/liga-nba/v-javnost-prisel-nov-posnetek-moranta-s-pistolo/668124',
 'authors': ['T. J.'],
 'date': '2023-05-14T18:17:00',
 'title': 'V javnost prišel nov posnetek Moranta s pištolo',
 'paragraphs': ['Moštvo iz Tennesseeja je ob tem zapisalo, da je Morant suspendiran z vseh klubskih dejavnosti, do preiskave disciplinske komisije Lige NBA pa dogodka ne bo komentiralo. Video je bil sicer medtem že izbrisan. Na njem se Morant vozi na sovoznikovem sedežu, kot je videti, pa v levi roki drži pištolo. Kdaj je posnetek nastal, sicer ni jasno.',
  'Marca je bil že kaznovan, potem ko je objavil posnetek iz nočnega lokala, na njem pa ima v rokah pištolo. Posnetek je nastal v Denverju, kjer je gostoval z Grizliji. Ker pištola ni bila njegova, jo je po preiskavi lige in obljubi, da bo poiskal profesionalno pomoč, odnesel z blažjo kaznijo osmih tekem prepovedi. Komisar lige Adam Silver je tedaj njegovo dejanje označil za neodgovorno, nepremišljeno in nevarn

In [6]:
tii = data[1] # test iteration instance
datetime.strptime(tii["date"].split("T")[0], "%Y-%m-%d").strftime("%A")
tii["date"].split("T")[1].split(":")[0]
tii["url"].split("/")
# tii["topics"]
" ".join(tii["authors"])
np.sqrt(tii["n_comments"])

6.244997998398398

In [7]:
i = 0
for a in data:
    if "topics" not in a.keys():
        continue
        print(i, ":", a)
    elif a["topics"] != a["url"].split("/")[3]:
        print(i, ":", "'topics' in topic v url se ne ujemata!")
        print("'{}' :: '{}'".format(a["topics"], a["url"].split("/")[3]))
        print(a)
    i+=1

#### Baseline

In [8]:
# sestavimo prvi del podatkov - dan v tednu, ura, avtorji, topic, subtopic
data_base_part1 = []

for article in data: # iteriramo cez vse clanke

    # samo datum, kot string
    date_string = article["date"].split("T")[0]
    # dan v tednu iz datuma
    day_of_week = datetime.strptime(date_string, "%Y-%m-%d").strftime("%A")

    # (samo) ura objave članka
    hour = article["date"].split("T")[1].split(":")[0]

    # avtorji
    authors = " ".join(article["authors"]) # kar zdruzimo v string, da bo encoder lahko sprejel

    #topic
    if "topics" in article.keys():
        # ce ima topic, ga kar direktno vzamemo
        topic = article["topics"] # sem preveril, da je vedno isto, kot v url... - ziher je ziher
    else:
        # ce manjka topic, ga vzamemo iz url-ja
        topic = article["url"].split("/")[3]

    #subtopic (samo prvi)
    subtopic = article["url"].split("/")[4]

    # sestavimo vrstico
    new_row = [
        day_of_week,
        hour,
        authors,
        topic,
        subtopic
    ]

    data_base_part1.append(new_row)


# in damo v one-hot encoding
onehotenc = OneHotEncoder(handle_unknown="ignore")
data_base_part1_enc = onehotenc.fit_transform(data_base_part1)

In [9]:
data_base_part1_enc

<20981x6069 sparse matrix of type '<class 'numpy.float64'>'
	with 104905 stored elements in Compressed Sparse Row format>

In [10]:
print(data_base_part1_enc)

  (0, 3)	1.0
  (0, 25)	1.0
  (0, 1041)	1.0
  (0, 1310)	1.0
  (0, 2646)	1.0
  (1, 0)	1.0
  (1, 20)	1.0
  (1, 638)	1.0
  (1, 1309)	1.0
  (1, 2551)	1.0
  (2, 2)	1.0
  (2, 17)	1.0
  (2, 623)	1.0
  (2, 1307)	1.0
  (2, 1508)	1.0
  (3, 3)	1.0
  (3, 26)	1.0
  (3, 704)	1.0
  (3, 1310)	1.0
  (3, 3423)	1.0
  (4, 6)	1.0
  (4, 15)	1.0
  (4, 902)	1.0
  (4, 1307)	1.0
  (4, 1819)	1.0
  :	:
  (20976, 6)	1.0
  (20976, 24)	1.0
  (20976, 852)	1.0
  (20976, 1313)	1.0
  (20976, 2069)	1.0
  (20977, 6)	1.0
  (20977, 17)	1.0
  (20977, 690)	1.0
  (20977, 1310)	1.0
  (20977, 5032)	1.0
  (20978, 6)	1.0
  (20978, 27)	1.0
  (20978, 1096)	1.0
  (20978, 1312)	1.0
  (20978, 1944)	1.0
  (20979, 6)	1.0
  (20979, 23)	1.0
  (20979, 1256)	1.0
  (20979, 1307)	1.0
  (20979, 2587)	1.0
  (20980, 6)	1.0
  (20980, 29)	1.0
  (20980, 286)	1.0
  (20980, 1310)	1.0
  (20980, 2646)	1.0


In [11]:
# sestavimo se drugo del podatkov - title in besedilo
data_base_part2 = []

for article in data: # iteriramo cez vse clanke

    # konkateniramo naslov in vse odstavke
    new_row = article["title"] + " " + " ".join(article["paragraphs"])

    data_base_part2.append(new_row)


# in vektoriziramo
vectorizer = TfidfVectorizer()
data_base_part2_vect = vectorizer.fit_transform(data_base_part2)

In [12]:
data_base_part2_vect

<20981x276525 sparse matrix of type '<class 'numpy.float64'>'
	with 5867478 stored elements in Compressed Sparse Row format>

In [13]:
# zdruzimo oba dela
data_base = hstack([data_base_part1_enc, data_base_part2_vect])

In [14]:
data_base

<20981x282594 sparse matrix of type '<class 'numpy.float64'>'
	with 5972383 stored elements in Compressed Sparse Row format>

In [15]:
# se y oz. stevila komentarjev
ground_truth_base = []

for article in data:

    # korenimo st komentarjev
    new_row = np.sqrt(article["n_comments"])

    ground_truth_base.append(new_row)

In [16]:
ground_truth_base

[8.774964387392123,
 6.244997998398398,
 1.0,
 1.4142135623730951,
 2.6457513110645907,
 2.8284271247461903,
 0.0,
 5.656854249492381,
 1.7320508075688772,
 4.358898943540674,
 4.242640687119285,
 2.23606797749979,
 8.246211251235321,
 6.244997998398398,
 11.916375287812984,
 18.841443681416774,
 26.267851073127396,
 0.0,
 18.110770276274835,
 16.0312195418814,
 5.477225575051661,
 4.123105625617661,
 3.3166247903554,
 5.0,
 13.564659966250536,
 12.569805089976535,
 3.1622776601683795,
 16.97056274847714,
 6.855654600401044,
 3.605551275463989,
 3.4641016151377544,
 6.928203230275509,
 10.63014581273465,
 1.0,
 18.49324200890693,
 6.244997998398398,
 0.0,
 5.744562646538029,
 2.449489742783178,
 7.937253933193772,
 2.0,
 3.0,
 1.0,
 3.4641016151377544,
 8.0,
 0.0,
 2.8284271247461903,
 1.4142135623730951,
 7.14142842854285,
 2.0,
 4.58257569495584,
 1.0,
 8.12403840463596,
 1.0,
 26.229754097208,
 2.23606797749979,
 5.0,
 1.4142135623730951,
 4.123105625617661,
 12.727922061357855,
 0.

In [17]:
# model
model = Ridge()
model.fit(data_base, ground_truth_base)

In [18]:
# pripravimo test set na isti nacin

test = open_json("data/rtvslo_test.json")
test_base_part1 = []
test_base_part2 = []


for article in test:

    # samo datum, kot string
    date_string = article["date"].split("T")[0]
    # dan v tednu iz datuma
    day_of_week = datetime.strptime(date_string, "%Y-%m-%d").strftime("%A")

    # (samo) ura objave članka
    hour = article["date"].split("T")[1].split(":")[0]

    # avtorji
    authors = " ".join(article["authors"]) # kar zdruzimo v string, da bo encoder lahko sprejel

    #topic
    if "topics" in article.keys():
        # ce ima topic, ga kar direktno vzamemo
        topic = article["topics"] # sem preveril, da je vedno isto, kot v url... - ziher je ziher
    else:
        # ce manjka topic, ga vzamemo iz url-ja
        topic = article["url"].split("/")[3]

    #subtopic (samo prvi)
    subtopic = article["url"].split("/")[4]

    # sestavimo vrstico
    new_row1 = [
        day_of_week,
        hour,
        authors,
        topic,
        subtopic
    ]

    test_base_part1.append(new_row1)

    # konkateniramo naslov in vse odstavke
    new_row2 = article["title"] + " " + " ".join(article["paragraphs"])

    test_base_part2.append(new_row2)

# uporabimo isti encoder in vectorizer
test_base_part1_enc = onehotenc.transform(test_base_part1)
test_base_part2_vect = vectorizer.transform(test_base_part2)
test_base = hstack([test_base_part1_enc, test_base_part2_vect])

JSON is valid.


In [19]:
test_base_part1_enc

<568x6069 sparse matrix of type '<class 'numpy.float64'>'
	with 2718 stored elements in Compressed Sparse Row format>

In [20]:
test_base_part2_vect

<568x276525 sparse matrix of type '<class 'numpy.float64'>'
	with 166088 stored elements in Compressed Sparse Row format>

In [21]:
test_base

<568x282594 sparse matrix of type '<class 'numpy.float64'>'
	with 168806 stored elements in Compressed Sparse Row format>

In [22]:
# predict
predictions_base = model.predict(test_base)

# potenciranje, da dobimo iz korenov nazaj st. komentarjev
predictions_submit = []
for p in predictions_base:
    predictions_submit.append(p ** 2)

# and save
np.savetxt('predictions.txt', predictions_submit)