<a href="https://colab.research.google.com/github/arashkol/python_class/blob/main/wikipedia_click.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdflib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdflib
  Downloading rdflib-6.2.0-py3-none-any.whl (500 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m500.3/500.3 KB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting isodate
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 KB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.1 rdflib-6.2.0


In [None]:
from curses.ascii import isascii
import requests
from rdflib import Graph, Namespace, Literal, RDF, URIRef
import gzip
import re
from urllib.parse import quote
import pandas as pd


In [None]:
# Download the gzip-compressed clickstream dump.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error reply into an exception here instead of letting
# an error page be handed to the decompression step.
response = requests.get(
    "https://dumps.wikimedia.org/other/clickstream/2023-01/clickstream-zhwiki-2023-01.tsv.gz",
    timeout=60,
)
response.raise_for_status()


In [None]:
# Inflate the gzip-compressed response body in memory, then decode the bytes as UTF-8 text.
data = str(gzip.decompress(response.content), "utf-8")

In [None]:
def good_string(iri_string):
    """Tell whether ``iri_string`` only contains characters from the allowed ranges.

    Args:
        iri_string: The text to check, or ``None``.

    Returns:
        ``False`` when ``iri_string`` is ``None``, when it contains a character outside
        the ranges whitelisted by the regular expression below, or when it cannot be
        encoded as UTF-8; ``True`` otherwise (including for the empty string).
    """
    # Reject None before anything else: re.search() raises TypeError on a None subject.
    if iri_string is None:
        return False

    # Check if the IRI string contains illegal characters
    if re.search(r"[^\u0020-\u007E\u00A0-\u00FF\u0100-\u1FFF\u2C00-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD]", iri_string):
        return False

    # Check if the IRI string is properly encoded in UTF-8
    try:
        iri_string.encode("utf-8")
    except UnicodeEncodeError:
        return False

    return True

In [None]:
def is_number(string):
    """Tell whether ``float()`` can parse ``string``.

    Args:
        string: The value to test, normally a text field.

    Returns:
        ``True`` if ``float(string)`` succeeds, ``False`` otherwise.
    """
    try:
        float(string)
    except (TypeError, ValueError):
        # ValueError: text that is not a number; TypeError: None or another
        # non-numeric object, which should also count as "not a number".
        return False
    return True

In [None]:
def has_only_ascii(string):
    """Return True when every character of ``string`` is ASCII (True for empty input)."""
    for character in string:
        if not character.isascii():
            return False
    return True

In [None]:
# Start from an empty frame that only fixes the column order: source, click count, target.
column_names = ["from", "clicks", "to"]
df = pd.DataFrame(columns=column_names)

In [None]:
# Markers that disqualify a row when they appear in one of its first three fields.
source_markers = ("other-search", "other-internal", "other-external")
other_markers = ("other-empty",) + source_markers

# Accepted rows are collected in a plain list and turned into a DataFrame once at the end.
# Growing a DataFrame row by row copies the whole frame on every step, and
# DataFrame.append no longer exists in pandas 2.x.
records = []

for line in data.split("\n"):
    # Split the line into columns
    columns = line.split("\t")

    # Skip the header row and every line (such as the trailing empty one) that does not
    # carry the four fields read below; this also rules out an IndexError on short lines.
    if len(columns) < 4 or columns[0] == "source":
        continue

    raw_source, raw_target, third_field, clicks = columns[:4]

    # Drop rows whose first field is exactly "other-empty" or mentions another marker.
    if raw_source == "other-empty" or any(marker in raw_source for marker in source_markers):
        continue

    # Drop rows whose second or third field mentions any marker.
    if any(marker in field for field in (raw_target, third_field) for marker in other_markers):
        continue

    # Keep only rows with a parseable click count and pure-ASCII source/target names.
    if not is_number(clicks) or not has_only_ascii(raw_source) or not has_only_ascii(raw_target):
        continue

    # Percent-encode both names before validating and storing them.
    source = quote(raw_source)
    target = quote(raw_target)

    if good_string(source) and good_string(target):
        # `clicks` stays a string here; later cells convert it with int().
        records.append({"from": source, "clicks": clicks, "to": target})

df = pd.DataFrame(records, columns=["from", "clicks", "to"])

In [138]:
# Preview the first rows of the parsed frame to check the three columns look as expected.
df.head()

Unnamed: 0,from,clicks,to
0,WhatsApp,12,Telegram
1,TG,10,Telegram
2,ALICESOFT,10,DARCROWS
3,Hide,15,Hyde
4,VAMPS,24,Hyde


# Check whether "Facebook" is among the "from" nodes

In [139]:
# Turn the "from" column into a plain Python list and test membership with `in`.
is_present = "Facebook" in df["from"].tolist()

print("Facebook is in the \"from\" list" if is_present else "Facebook is not in the \"from\" list")

Facebook is in the "from" list


# Check whether an arbitrary string stored in `from_node` is among the "from" nodes

In [140]:
from_node = "WhatsApp"

# Same list-based membership test, now driven by a variable instead of a literal.
is_present = from_node in df["from"].tolist()

print(f"{from_node} is in the \"from\" list" if is_present else f"{from_node} is not in the \"from\" list")

WhatsApp is in the "from" list


In [141]:
# Using isin() function:
from_node = "WhatsApp"

if df["from"].isin([from_node]).any():
  print(f"{from_node} is in the \"from\" list")
else:
  print(f"{from_node} is not in the \"from\" list")

WhatsApp is in the "from" list


In [142]:
# Show the raw element-wise mask: one boolean per row of the "from" column.
df["from"].isin([from_node])

0        True
1       False
2       False
3       False
4       False
        ...  
3884    False
3885    False
3886    False
3887    False
3888    False
Name: from, Length: 3889, dtype: bool

# if, then, else and elif

Given the index of a row in the DataFrame, say how strong the connection between "from" and "to" is, based on:

**clicks <= 10 -> Weak connection**

**10 < clicks <=  20  -> Normal connection**

**20 < clicks -> Strong connection**

In [143]:
row_index = 1000

row = df.iloc[row_index]

# Read fields by column label; integer keys on a label-indexed Series depend on a
# positional fallback that pandas has deprecated.
clicks = int(row["clicks"])

# A single if/elif/else chain: exactly one branch runs, so `strength` is always assigned
# for the current row and no range of click counts is left uncovered.
if clicks <= 10:
  strength = "low"
elif clicks <= 20:
  strength = "normal"
else:
  # Everything above 20 is "strong", the same threshold the loop cells of this notebook use.
  strength = "strong"

print(f"The connection between {row['from']} and {row['to']} is {strength}")

The connection between MIXNINE and ONF is normal


# for loop
Apply the same procedure to the first 10 rows of the DataFrame.

In [144]:
# Visit the first 10 rows, or fewer when the frame is shorter, so iloc never runs past the end.
for row_index in range(min(10, len(df))):

    row = df.iloc[row_index]

    # Label-based access instead of the deprecated positional row[1] / row[0] / row[2].
    clicks = int(row["clicks"])

    # The branches are mutually exclusive and cover every value.
    if clicks <= 10:
        strength = "low"
    elif clicks <= 20:
        strength = "normal"
    else:
        strength = "strong"

    print(f"In row {row_index} The connection between {row['from']} and {row['to']} is {strength}")

In row 0 The connection between WhatsApp and Telegram is normal
In row 1 The connection between TG and Telegram is low
In row 2 The connection between ALICESOFT and DARCROWS is low
In row 3 The connection between Hide and Hyde is normal
In row 4 The connection between VAMPS and Hyde is strong
In row 5 The connection between GLAMOROUS_SKY and Hyde is normal
In row 6 The connection between YOLO and Memento_mori is normal
In row 7 The connection between ClariS and Irony is strong
In row 8 The connection between Nusinersen and Onasemnogene_abeparvovec is normal
In row 9 The connection between Battle_Spirits and Battle_Spirits_Brave is strong


# **pass** command in loops
If the number of clicks is greater than 20, do nothing, but keep a placeholder in the program so that the handling of this case can be decided later.

In [145]:
# Demonstrates `pass` as a placeholder branch while labelling the first 10 rows.
# NOTE: this cell is an intentional exercise (see the closing comment); its behaviour is
# left exactly as written.
for row_index in range(0,10):

    # One row of the frame, as a Series holding the "from", "clicks" and "to" values.
    row = df.iloc[row_index]

    # Click count of the row (second column), converted from text to an integer.
    clicks = int(row[1])

    if clicks <= 10:
        strength = "low"
    elif clicks > 10 and clicks <= 20:
        strength = "normal"
    elif clicks > 20:
        pass #do nothing

    print(f"In row {row_index} The connection between {row[0]} and {row[2]} is {strength}")
#find the bug... ;)

In row 0 The connection between WhatsApp and Telegram is normal
In row 1 The connection between TG and Telegram is low
In row 2 The connection between ALICESOFT and DARCROWS is low
In row 3 The connection between Hide and Hyde is normal
In row 4 The connection between VAMPS and Hyde is normal
In row 5 The connection between GLAMOROUS_SKY and Hyde is normal
In row 6 The connection between YOLO and Memento_mori is normal
In row 7 The connection between ClariS and Irony is normal
In row 8 The connection between Nusinersen and Onasemnogene_abeparvovec is normal
In row 9 The connection between Battle_Spirits and Battle_Spirits_Brave is normal


# **continue** command in loops
If the "from" column is equal to "Hide", skip that row and continue with the rest of the dataset.

In [146]:
# Visit the first 10 rows, or fewer when the frame is shorter, so iloc never runs past the end.
for row_index in range(min(10, len(df))):

    row = df.iloc[row_index]

    # `continue` jumps straight to the next iteration: nothing below runs for this row.
    # Fields are read by label instead of the deprecated positional row[0] / row[1] / row[2].
    if row["from"] == "Hide":
      continue

    clicks = int(row["clicks"])

    # The branches are mutually exclusive and cover every value.
    if clicks <= 10:
        strength = "low"
    elif clicks <= 20:
        strength = "normal"
    else:
        strength = "strong"

    print(f"In row {row_index} The connection between {row['from']} and {row['to']} is {strength}")

In row 0 The connection between WhatsApp and Telegram is normal
In row 1 The connection between TG and Telegram is low
In row 2 The connection between ALICESOFT and DARCROWS is low
In row 4 The connection between VAMPS and Hyde is strong
In row 5 The connection between GLAMOROUS_SKY and Hyde is normal
In row 6 The connection between YOLO and Memento_mori is normal
In row 7 The connection between ClariS and Irony is strong
In row 8 The connection between Nusinersen and Onasemnogene_abeparvovec is normal
In row 9 The connection between Battle_Spirits and Battle_Spirits_Brave is strong


# **break** command in loops
Look for the first occurrence of the word "YOLO", report its index and stop the search.

In [149]:
search_term = "YOLO"

# Scan the first 10 rows in order; `break` ends the loop at the first row that holds the
# term in any of its columns.
for row_index in range(10):
    row = df.iloc[row_index]
    if search_term not in row.tolist():
        continue
    print(search_term+f" was found at the index {row_index}")
    break

YOLO was found at the index 6


# **while** loop
Gather at least 100 clicks in a list of (from, clicks, to) tuples. The sum may exceed 100, as long as removing the last element would bring it below 100.

In [155]:
sum_clicks = 0
row_index = 0
# (from, clicks, to) tuples of the rows that were added up.
gathered = []

# Stop as soon as the target is reached, or when the frame has no rows left; the second
# condition keeps iloc from running past the end if all clicks add up to less than 100.
while sum_clicks < 100 and row_index < len(df):

  row = df.iloc[row_index]
  row_index += 1
  # Label-based access instead of the deprecated positional row[1].
  clicks = int(row["clicks"])

  gathered.append((row["from"], clicks, row["to"]))
  sum_clicks += clicks

# row_index has already been advanced past the last row read, so it equals the number of
# rows consumed.
if sum_clicks >= 100:
  print(f"Sum of {sum_clicks} clicks was met at the index {row_index}")
else:
  print(f"Only {sum_clicks} clicks were found in {row_index} rows; the target of 100 was not reached")


Sum of 131 clicks was met at the index 8
