In [2]:
import pandas as pd

# Define paths to the files
file_paths = {
    'Spanish': 'WPData/es_wiki.txt',
    'French': 'WPData/fr_wiki.txt'
}

# Function to load data into a DataFrame
def load_data(language, file_path):
    # Read the file into a DataFrame
    df = pd.read_csv(file_path, sep=" ", header=None, usecols=[1], names=['User'])
    df['Language'] = language  # Add a column for language
    return df

# Combine data from all files into a single DataFrame
data_frames = [load_data(language, path) for language, path in file_paths.items()]
combined_df = pd.concat(data_frames)

# Group by language and count unique users
unique_users_count = combined_df.groupby('Language')['User'].nunique()

# Display the result
print(unique_users_count)

Language
French      203042
Spanish    2682096
Name: User, dtype: int64


In [4]:
combined_df.sample(5)

Unnamed: 0,User,Language
12728320,88.2.232.122,Spanish
362831,Espilas,Spanish
13490960,Opliwtb,Spanish
5276111,Urdangaray,Spanish
14625330,Feliciano,Spanish


In [1]:
import os

def replace_in_file_comma_spaces(file_path):
    temp_file_path = file_path + '.tmp'  # Create a temporary file name

    with open(file_path, 'r', encoding='utf-8') as file, open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        for line in file:
            temp_file.write(line.replace("', '", "','"))

    os.replace(temp_file_path, file_path)  # Move the temporary file to the original file

#file_path = 'WPData/es_wiki.txt'
#replace_in_file(file_path)

In [2]:
import os
#Update spanish file
def replace_in_file_spaces(file_path):
    temp_file_path = file_path + '.tmp'  # Create a temporary file name

    with open(file_path, 'r', encoding='utf-8') as file, open(temp_file_path, 'w', encoding='utf-8') as temp_file:
        for line in file:
            temp_file.write(line.replace('    ', ' '))

    os.replace(temp_file_path, file_path)  # Move the temporary file to the original file

# Usage
#file_path = 'WPData/es_wiki.txt'
#replace_in_file(file_path)

In [1]:
import pandas as pd
from collections import Counter

# Custom function to parse data from file
def parse_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ')
            title = parts[0]
            user = parts[1]
            timestamp = parts[2]
            categories = parts[3:]
            # Cleaning category data
            categories = [category.strip("[]',") for category in categories]
            data.append([title, user, timestamp, categories])
    return pd.DataFrame(data, columns=['Title', 'User', 'Timestamp', 'Categories'])

# Cleaning function for category lists
def clean_categories(category_list):
    return [category.strip("[]',") for category in category_list.split(',')]

# Count unique users and category occurrences
def count_users_by_category(data):
    category_counts = Counter()
    for _, row in data.iterrows():
        for category in row['Categories']:
            category_counts[category] += 1
    return category_counts

# Paths to the data files
french_file = 'WPData/fr_wiki.txt'
spanish_file = 'WPData/es_wiki.txt'

# Parse data
fr_data = parse_data(french_file)
es_data = parse_data(spanish_file)

# Correcting the parsing of categories
#fr_data['Categories'] = fr_data['Categories'].apply(lambda x: clean_categories(x[0]))
#es_data['Categories'] = es_data['Categories'].apply(lambda x: clean_categories(x[0]))

# Unique users
unique_users_fr = fr_data['User'].nunique()
unique_users_es = es_data['User'].nunique()

# Users per category
category_counts_fr = count_users_by_category(fr_data)
category_counts_es = count_users_by_category(es_data)

# Display results
print("French unique users:", unique_users_fr)
print("Spanish unique users:", unique_users_es)
#print("French category counts:", category_counts_fr)
#print("Spanish category counts:", category_counts_es)


French unique users: 203055
Spanish unique users: 2682096


## First Step to load data in df 

In [6]:
import pandas as pd
import os
# Function to read data and transform into a usable DataFrame
def load_and_prepare_data(file_path):
    # Read the data from the text file with space as a separator
    data = pd.read_csv(file_path, sep=" ", header=None, names=["title", "user", "timestamp", "categories"])
    
    # Convert UNIX timestamps to human-readable datetime
    data['datetime'] = pd.to_datetime(data['timestamp'], unit='s')
    
    #Convert to JSON format to faster implementation
    data['categories'] = data['categories'].str.replace("'", '"')
    
    # Convert string representation of category lists to actual lists
    #data['categories'] = data['categories'].apply(json.loads)
    
    return data

In [8]:
data_fr['categories'] = data_fr['categories'].str.replace("'", '"')

In [9]:
import json

data_fr['categories'] = data_fr['categories'].apply(json.loads)

In [10]:
data_fr.sample(10)

Unnamed: 0,title,user,timestamp,categories,datetime
15908960,CEC_European_Managers,CECmanagers,1220865473,[politica],2008-09-08 09:17:53
5141000,Petropedetidae,Poleta33,1214474527,[naturaleza],2008-06-26 10:02:07
5721407,Hull_City_Association_Football_Club,Axou,1166032856,"[deporte, eventos, geografia, historia, politi...",2006-12-13 18:00:56
11199829,Société_à_2000_watts,Viaferrata,1207924095,"[geografia, historia, naturaleza, politica]",2008-04-11 14:28:15
5856076,Communauté_de_communes_de_Haute_Maurienne-Vanoise,Hmv~frwiki,1256048553,"[derecho, geografia, historia, politica]",2009-10-20 14:22:33
7840783,Transformations_de_Paris_sous_le_Second_Empire,Lilyu,1205864877,"[eventos, geografia, historia, politica]",2008-03-18 18:27:57
6395741,Loix,JarnaQuais,1139476969,"[derecho, geografia, politica]",2006-02-09 09:22:49
6636412,Luis_Guzmán_(acteur),Bulat,1126949155,"[arte, historia]",2005-09-17 09:25:55
14619473,Moinville-la-jeulin,Loveless,1195285541,"[derecho, geografia, politica]",2007-11-17 07:45:41
12816648,Nana_Kitade,Nicorr,1179315962,"[arte, historia]",2007-05-16 11:46:02


# Process french data

In [24]:
#data_fr.to_pickle('converted/df_fr')
data_fr=load_and_prepare_data('WPData/fr_wiki.txt')
len(data_fr)

In [None]:
import json
data_es['categories'] = data_es['categories'].apply(json.loads)

In [11]:
df_exploded = data_fr.explode('categories')

In [16]:
df_exploded.head(20)

Unnamed: 0,title,user,timestamp,categories,datetime
0,Antoine_Meillet,Curry,1031518186,arte,2002-09-08 20:49:46
0,Antoine_Meillet,Curry,1031518186,geografia,2002-09-08 20:49:46
0,Antoine_Meillet,Curry,1031518186,historia,2002-09-08 20:49:46
0,Antoine_Meillet,Curry,1031518186,politica,2002-09-08 20:49:46
1,Antoine_Meillet,script_de_conversion,1036059091,arte,2002-10-31 10:11:31
1,Antoine_Meillet,script_de_conversion,1036059091,geografia,2002-10-31 10:11:31
1,Antoine_Meillet,script_de_conversion,1036059091,historia,2002-10-31 10:11:31
1,Antoine_Meillet,script_de_conversion,1036059091,politica,2002-10-31 10:11:31
2,Antoine_Meillet,Francis,1036336188,arte,2002-11-03 15:09:48
2,Antoine_Meillet,Francis,1036336188,geografia,2002-11-03 15:09:48


In [23]:
df_exploded.to_pickle('df_fr_processed')

In [18]:
users_per_category_fr = df_exploded.groupby('categories')['user'].nunique()

In [21]:
users_per_category_fr

categories
arte          115336
deporte        43664
derecho        70693
eventos        99840
filosofia      62825
geografia     117301
historia      157533
juegos         21912
matematica     26984
naturaleza     61967
politica      120976
religion       44883
salud          76725
Name: user, dtype: int64

# Process spanish data

In [16]:
data_es=load_and_prepare_data('WPData/es_wiki.txt')

In [17]:
%%time
import json
data_es['categories'] = data_es['categories'].apply(json.loads)
data_es.head(20)

CPU times: user 17.5 s, sys: 37.4 s, total: 54.9 s
Wall time: 3min 59s


Unnamed: 0,title,user,timestamp,categories,datetime
0,Gregg_Sulkin,87.217.240.27,1287843598,"[arte, derecho, eventos, filosofia, historia, ...",2010-10-23 14:19:58
1,Gregg_Sulkin,87.217.240.27,1287843504,"[arte, derecho, eventos, filosofia, historia, ...",2010-10-23 14:18:24
2,Sonangol,Sentoan,1287843154,"[derecho, filosofia, geografia, historia, natu...",2010-10-23 14:12:34
3,Matt_&_Kim,Shining.Star,1287842935,"[arte, derecho, filosofia]",2010-10-23 14:08:55
4,Matt_&_Kim,Shining.Star,1287842910,"[arte, derecho, filosofia]",2010-10-23 14:08:30
5,Ricardo_Aparicio_y_Aparicio,Petronas,1287842871,"[derecho, eventos, filosofia, historia, politica]",2010-10-23 14:07:51
6,Ricardo_Aparicio_y_Aparicio,Petronas,1287842823,"[derecho, eventos, filosofia, historia, politica]",2010-10-23 14:07:03
7,Ricardo_Aparicio_y_Aparicio,Petronas,1287842803,"[derecho, eventos, filosofia, historia, politica]",2010-10-23 14:06:43
8,Proyecto_Agorá,AWUY,1287843205,"[matematica, politica, religion]",2010-10-23 14:13:25
9,Proyecto_Agorá,AWUY,1287842345,"[matematica, politica, religion]",2010-10-23 13:59:05


In [34]:
null_or_empty = data_es['categories'].isna() | (data_es['categories'].map(len) == 0)

# Filter the DataFrame based on the condition
filtered_df = data_es[null_or_empty]

In [38]:
data_es.to_pickle('converted/df_es')

In [39]:
data_es.sample(20)

Unnamed: 0,title,user,timestamp,categories,datetime
1039298,Carlos_Slim,Asfarer,1271945500,"[derecho, eventos, filosofia, historia, politica]",2010-04-22 14:11:40
11988716,Gonzalo_de_Alvarado,88.87.214.30,1203526717,"[derecho, filosofia, historia, politica]",2008-02-20 16:58:37
9696685,Avenida_Cristóbal_Colón_(Santiago),142.59.109.203,1162960439,[historia],2006-11-08 04:33:59
9464047,Prunus_spinosa,Cookie,1173997203,"[geografia, historia, naturaleza, salud]",2007-03-15 22:20:03
4400751,Ana_Lucía_Domínguez,Tigresoft,1186162316,"[arte, derecho, eventos, filosofia, historia]",2007-08-03 17:31:56
12818984,Counter-Strike,FedericoMP,1170280006,[derecho],2007-01-31 21:46:46
1256130,Estadio_North_Harbour,YoaR,1244983679,"[arte, deporte, eventos, historia]",2009-06-14 12:47:59
4316677,Función_cuadrática,Paisvasco,1279444341,"[filosofia, matematica]",2010-07-18 09:12:21
16578713,Unión_Europea,RubencinMdE,1276265754,"[derecho, eventos, filosofia, geografia, histo...",2010-06-11 14:15:54
2881335,Somos_tú_y_yo,Marcos94,1223848289,"[arte, historia]",2008-10-12 21:51:29


In [40]:
dfes_exploded = data_es.explode('categories')

In [41]:
users_per_category_es = dfes_exploded.groupby('categories')['user'].nunique()
users_per_category_es

categories
arte          1165179
deporte        398988
derecho       1102277
eventos        998607
filosofia     1302309
geografia      716028
historia      1709310
juegos         124548
matematica     159728
naturaleza     867772
politica       868474
religion       372573
salud          432283
Name: user, dtype: int64

## Process deutch data

In [3]:
file_path = 'WPData/de_wiki.txt'
replace_in_file_comma_spaces(file_path)

In [8]:
data_de=load_and_prepare_data('WPData/de_wiki.txt')

In [9]:
import json

data_de['categories'] = data_de['categories'].apply(json.loads)

In [10]:
data_de.sample(10)

Unnamed: 0,title,user,timestamp,categories,datetime
9827145,Robert_Anderson_(Polizist),Gaudio,1243867238,[religion],2009-06-01 14:40:38
15920825,Stabkirche_Heddal,Pinzgauer,1155159157,"[arte, religion]",2006-08-09 21:32:37
10973172,Liste_Schweizer_Schriftsteller,ThoR,1209512287,[arte],2008-04-29 23:38:07
23439587,Architektur_der_Renaissance,Giano_II,1161526986,"[arte, historia]",2006-10-22 14:23:06
1274519,Vincent_van_Gogh,HaeB,1137004044,"[arte, religion]",2006-01-11 18:27:24
14159704,Ada_(Karien),Bergantini,1243361661,[arte],2009-05-26 18:14:21
2900038,Paul_Heyse,Aka,1142680138,[arte],2006-03-18 11:08:58
170638,Demokratie,Helenopel,1229190979,"[derecho, filosofia, historia, politica]",2008-12-13 17:56:19
18915691,UEFA-Pokal_2008/09,Wiki12345,1222547376,[deporte],2008-09-27 20:29:36
22317156,Danijel_Aleksić,Muck31,1251666980,[deporte],2009-08-30 21:16:20


In [11]:
data_de.to_pickle('converted/df_de')

In [15]:
unique_users_de = data_de['user'].nunique()
print("Deutch unique users:", unique_users_de)

Deutch unique users: 357575


In [18]:
#dfde_exploded = data_de.explode('categories')
#users_per_category_de = dfde_exploded.groupby('categories')['user'].nunique()
print("Deutch users by category:")
users_per_category_de

Deutch users by category:


categories
arte          235673
deporte        65686
derecho        69434
eventos        61160
filosofia      27019
geografia     107563
historia      143560
juegos         22460
matematica     19301
naturaleza     79276
politica      133351
religion       73279
salud          24653
Name: user, dtype: int64

# Processing AR data

In [20]:
file_path = 'WPData/ar_wp.txt'
replace_in_file_comma_spaces(file_path)

In [27]:
data_ar=load_and_prepare_data('WPData/ar_wp.txt')
import json

data_ar['categories'] = data_ar['categories'].apply(json.loads)

In [30]:
data_ar.head(10)

Unnamed: 0,title,user,timestamp,categories,datetime
0,ماء,piolinfax,1057882982,"[art, foods, games, geography, health, history...",2003-07-11 00:23:02
1,ماء,piolinfaxx,1058466271,"[art, foods, games, geography, health, history...",2003-07-17 18:24:31
2,ماء,piolinfaxx,1058466334,"[art, foods, games, geography, health, history...",2003-07-17 18:25:34
3,ماء,piolinfaxx,1058466421,"[art, foods, games, geography, health, history...",2003-07-17 18:27:01
4,ماء,piolinfaxx,1062962670,"[art, foods, games, geography, health, history...",2003-09-07 19:24:30
5,ماء,piolinfaxx,1062963400,"[art, foods, games, geography, health, history...",2003-09-07 19:36:40
6,ماء,traroth,1064325118,"[art, foods, games, geography, health, history...",2003-09-23 13:51:58
7,ماء,عصام_بايزيدي,1076234052,"[art, foods, games, geography, health, history...",2004-02-08 09:54:12
8,ماء,عصام_بايزيدي,1080825850,"[art, foods, games, geography, health, history...",2004-04-01 13:24:10
9,ماء,عصام_بايزيدي,1080825869,"[art, foods, games, geography, health, history...",2004-04-01 13:24:29


In [29]:
data_ar.to_pickle('converted/df_ar')

In [33]:
unique_users_ar = data_ar['user'].nunique()
print("AR unique users:", unique_users_ar)
dfar_exploded = data_de.explode('categories')
users_per_category_ar = dfar_exploded.groupby('categories')['user'].nunique()
print("AR users by category:")
users_per_category_ar

AR unique users: 23641
AR users by category:


categories
art            20669
events         14185
foods           2021
games           4753
geography      20347
health         13616
history        21199
mathematics     6780
nature         22499
philosophy     15086
politics       21953
religion       14612
rights          7980
sports          6940
Name: user, dtype: int64

## Processing HU Data

In [38]:
file_path = 'WPData/hu_wiki.txt'
replace_in_file_comma_spaces(file_path)
replace_in_file_spaces(file_path)

In [39]:
data_hu=load_and_prepare_data('WPData/hu_wiki.txt')
import json

data_hu['categories'] = data_hu['categories'].apply(json.loads)

In [40]:
data_hu.sample(10)

Unnamed: 0,title,user,timestamp,categories,datetime
2410790,Karl_May,Fausto,1253460335,"[arte, historia, naturaleza]",2009-09-20 15:25:35
1923320,Bod_Péter,Csigabi,1250698031,"[arte, derecho, historia, politica, religion]",2009-08-19 16:07:11
2662800,A_Magyar_Tudományos_Akadémia_tagjainak_listája...,Pasztilla,1286822482,[geografia],2010-10-11 18:41:22
1209292,Ökölvívók_listája,Tyr,1196097867,[deporte],2007-11-26 17:24:27
641410,Olimpiai_negyedik_helyezett_magyar_sportolók_l...,Karesz52,1221978219,"[deporte, eventos]",2008-09-21 06:23:39
1699198,Szerb_Radikális_Párt,TheGoaT,1161894585,"[geografia, historia, politica]",2006-10-26 20:29:45
881704,Tinód,Kossuthzs.,1205321618,"[geografia, historia]",2008-03-12 11:33:38
47543,Aszó_(Bákó_megye),Szabi237,1285690608,[geografia],2010-09-28 16:16:48
2098225,Ibrahim_Rugova,Mazarin07,1198358304,"[historia, politica, religion]",2007-12-22 21:18:24
1098464,Help_(a_gong),Waltermattau,1193502474,"[arte, historia]",2007-10-27 16:27:54


In [41]:
data_hu.to_pickle('converted/df_hu')

In [42]:
unique_users_hu = data_hu['user'].nunique()
print("AR unique users:", unique_users_hu)
dfhu_exploded = data_hu.explode('categories')
users_per_category_hu = dfhu_exploded.groupby('categories')['user'].nunique()
print("AR users by category:")
users_per_category_hu

AR unique users: 148067
AR users by category:


categories
arte          64258
deporte       17787
derecho       22921
eventos       16989
filosofia     14864
geografia     66744
historia      89581
juegos         5715
matematica     5909
naturaleza    47036
politica      42261
religion      18491
salud         10077
Name: user, dtype: int64

## Processing IT Data

In [43]:
file_path = 'WPData/it_wiki.txt'
replace_in_file_comma_spaces(file_path)

In [44]:
data_it=load_and_prepare_data('WPData/it_wiki.txt')
import json

data_it['categories'] = data_it['categories'].apply(json.loads)

In [45]:
data_it.head(10)

Unnamed: 0,title,user,timestamp,categories,datetime
0,Armonium,ppp-143-207.29-151.libero.it,1000462768,"[arte, geografia]",2001-09-14 10:19:28
1,Armonium,Gianfranco,1039481542,"[arte, geografia]",2002-12-10 00:52:22
2,Armonium,MikyT,1071179375,"[arte, geografia]",2003-12-11 21:49:35
3,Armonium,Sbisolo,1073051385,"[arte, geografia]",2004-01-02 13:49:45
4,Armonium,Twice25,1088372438,"[arte, geografia]",2004-06-27 21:40:38
5,Armonium,Gac,1088575553,"[arte, geografia]",2004-06-30 06:05:53
6,Armonium,Gac,1088575568,"[arte, geografia]",2004-06-30 06:06:08
7,Armonium,Archenzo,1088621569,"[arte, geografia]",2004-06-30 18:52:49
8,Armonium,Twice25,1091139099,"[arte, geografia]",2004-07-29 22:11:39
9,Armonium,M7,1091141564,"[arte, geografia]",2004-07-29 22:52:44


In [46]:
data_it.to_pickle('converted/df_it')

In [48]:
unique_users_it = data_it['user'].nunique()
print("IT unique users:", unique_users_it)
dfit_exploded = data_it.explode('categories')
users_per_category_it = dfit_exploded.groupby('categories')['user'].nunique()
print("IT users by category:")
users_per_category_it

IT unique users: 97162
IT users by category:


categories
arte          53885
deporte       21048
derecho       27933
eventos       52503
filosofia     24890
geografia     45730
historia      38546
juegos        17967
matematica     8641
naturaleza    31976
politica      30482
religion      28233
salud         28341
Name: user, dtype: int64

## Processing JA Data

In [49]:
file_path = 'WPData/ja_wp.txt'
replace_in_file_comma_spaces(file_path)

In [50]:
data_ja=load_and_prepare_data('WPData/ja_wp.txt')
import json

data_ja['categories'] = data_ja['categories'].apply(json.loads)

In [51]:
data_ja.head(10)

Unnamed: 0,title,user,timestamp,categories,datetime
0,アンパサンド,suisui,1092369109,"[art, history, nature]",2004-08-13 03:51:49
1,アンパサンド,suisui,1093021692,"[art, history, nature]",2004-08-20 17:08:12
2,アンパサンド,kzhr,1090561889,"[art, history, nature]",2004-07-23 05:51:29
3,アンパサンド,michey.m,1070463995,"[art, history, nature]",2003-12-03 15:06:35
4,アンパサンド,っ,1087950080,"[art, history, nature]",2004-06-23 00:21:20
5,アンパサンド,っ,1088001066,"[art, history, nature]",2004-06-23 14:31:06
6,アンパサンド,っ,1090642718,"[art, history, nature]",2004-07-24 04:18:38
7,アンパサンド,ndr,1121916254,"[art, history, nature]",2005-07-21 03:24:14
8,アンパサンド,goki,1160972422,"[art, history, nature]",2006-10-16 04:20:22
9,アンパサンド,朝彦,1087528715,"[art, history, nature]",2004-06-18 03:18:35


In [52]:
data_ja.to_pickle('converted/df_ja')

In [54]:
unique_users_ja = data_ja['user'].nunique()
print("JA unique users:", unique_users_ja)
dfja_exploded = data_ja.explode('categories')
users_per_category_ja = dfja_exploded.groupby('categories')['user'].nunique()
print("JA users by category:")
users_per_category_ja

JA unique users: 126657
JA users by category:


categories
art             85810
events          20072
foods           14787
games           28442
geography       79168
health          45394
history        103004
mathematics     18879
nature          86584
philosophy      44951
politics        66756
religion        29220
rights          31483
sports          34954
Name: user, dtype: int64

## Processing PT Data

In [55]:
file_path = 'WPData/pt_wiki.txt'
replace_in_file_comma_spaces(file_path)

In [56]:
data_pt=load_and_prepare_data('WPData/pt_wiki.txt')
import json

data_pt['categories'] = data_pt['categories'].apply(json.loads)

ValueError: non convertible value ^^^_2011-10-25T04:34:47Z with the unit 's', at position 0

In [51]:
data_pt.head(10)

Unnamed: 0,title,user,timestamp,categories,datetime
0,アンパサンド,suisui,1092369109,"[art, history, nature]",2004-08-13 03:51:49
1,アンパサンド,suisui,1093021692,"[art, history, nature]",2004-08-20 17:08:12
2,アンパサンド,kzhr,1090561889,"[art, history, nature]",2004-07-23 05:51:29
3,アンパサンド,michey.m,1070463995,"[art, history, nature]",2003-12-03 15:06:35
4,アンパサンド,っ,1087950080,"[art, history, nature]",2004-06-23 00:21:20
5,アンパサンド,っ,1088001066,"[art, history, nature]",2004-06-23 14:31:06
6,アンパサンド,っ,1090642718,"[art, history, nature]",2004-07-24 04:18:38
7,アンパサンド,ndr,1121916254,"[art, history, nature]",2005-07-21 03:24:14
8,アンパサンド,goki,1160972422,"[art, history, nature]",2006-10-16 04:20:22
9,アンパサンド,朝彦,1087528715,"[art, history, nature]",2004-06-18 03:18:35


In [52]:
data_pt.to_pickle('converted/df_pt')

In [54]:
unique_users_pt = data_ja['user'].nunique()
print("PT unique users:", unique_users_pt)
dfpt_exploded = data_pt.explode('categories')
users_per_category_pt = dfpt_exploded.groupby('categories')['user'].nunique()
print("PT users by category:")
users_per_category_pt

JA unique users: 126657
JA users by category:


categories
art             85810
events          20072
foods           14787
games           28442
geography       79168
health          45394
history        103004
mathematics     18879
nature          86584
philosophy      44951
politics        66756
religion        29220
rights          31483
sports          34954
Name: user, dtype: int64

## Processing RU Data

In [57]:
file_path = 'WPData/ru_wp.txt'
replace_in_file_comma_spaces(file_path)

In [58]:
data_ru=load_and_prepare_data('WPData/ru_wp.txt')
import json

data_ru['categories'] = data_ru['categories'].apply(json.loads)

In [60]:
data_ru.sample(10)

Unnamed: 0,title,user,timestamp,categories,datetime
4779115,Восток_(клуб_песни),myke,1227284771,[art],2008-11-21 16:26:11
646495,Иприт,deerhunter,1193162117,[nature],2007-10-23 17:55:17
3962033,Институт_рабочего_контроля,lehtman,1201387940,"[history, politics]",2008-01-26 22:52:20
1529668,sony_mobile,lite,1188462859,[history],2007-08-30 08:34:19
4418649,Ленинский_сквер_(Липецк),dsropen,1214923396,"[geography, nature]",2008-07-01 14:43:16
1482766,Монстр_в_коробке,kr,1161153940,"[art, nature]",2006-10-18 06:45:40
3613315,Талыш-Муганская_Автономная_Республика,grandmaster,1226473369,"[geography, history, politics]",2008-11-12 07:02:49
1188510,Муром,maximaximax,1125510758,"[geography, history]",2005-08-31 17:52:38
2542972,Мэн-цзы,unclemartin,1258460623,"[history, nature, philosophy, religion]",2009-11-17 12:23:43
534472,Киноискусство,avb,1249644547,[art],2009-08-07 11:29:07


In [61]:
data_ru.to_pickle('converted/df_ru')

In [62]:
unique_users_ru = data_ru['user'].nunique()
print("RU unique users:", unique_users_ru)
dfru_exploded = data_ru.explode('categories')
users_per_category_ru = dfru_exploded.groupby('categories')['user'].nunique()
print("RU users by category:")
users_per_category_ru

RU unique users: 76066
RU users by category:


categories
art            41694
events         23696
foods           6088
games          13838
geography      30652
health         13937
history        41006
mathematics     9260
nature         51426
philosophy     16170
politics       24119
religion       18301
rights         17644
sports         14963
Name: user, dtype: int64

## Processing VI Data

In [63]:
file_path = 'WPData/vi_wp.txt'
replace_in_file_comma_spaces(file_path)

In [64]:
data_vi=load_and_prepare_data('WPData/vi_wp.txt')
import json

data_vi['categories'] = data_vi['categories'].apply(json.loads)

In [65]:
data_vi.sample(10)

Unnamed: 0,title,user,timestamp,categories,datetime
592998,xử_lý_ngôn_ngữ_tự_nhiên,ditimchanly,1224777979,"[health, history, mathematics, nature, philoso...",2008-10-23 16:06:19
21708,tản_Đà,bthanh,1268924017,"[art, events, geography, history, nature, phil...",2010-03-18 14:53:37
392348,võ_thiếu_lâm,shaolin_kungfu,1188293247,"[art, games, geography, health, nature, politi...",2007-08-28 09:27:27
64320,nam_tiến,nds,1200134053,"[geography, history, nature, politics]",2008-01-12 10:34:13
33717,việt_nam_dân_chủ_cộng_hòa,y_kpia_mlo,1254591533,"[art, events, geography, history, nature, phil...",2009-10-03 17:38:53
427409,jethro_tull_(nhà_nông_học),genghiskhan,1189074861,"[events, geography, history, nature, politics]",2007-09-06 10:34:21
711043,friedrich_i_của_Đế_quốc_la_mã_thần_thánh,trungda,1267955673,"[events, geography, history, nature, politics,...",2010-03-07 09:54:33
83685,người_pà_thẻn,ttienngoc,1125816094,"[geography, history, nature, philosophy, polit...",2005-09-04 06:41:34
609534,công_khai_thiên_hướng_tình_dục,mặt_trời_đỏ,1239311690,"[health, nature, philosophy, politics]",2009-04-09 21:14:50
677134,fernando_verdasco,a_trát_liên_tạp,1254033806,"[events, games, health, history, nature, sports]",2009-09-27 06:43:26


In [66]:
data_vi.to_pickle('converted/df_vi')

In [67]:
unique_users_vi = data_vi['user'].nunique()
print("VI unique users:", unique_users_vi)
dfvi_exploded = data_vi.explode('categories')
users_per_category_vi = dfvi_exploded.groupby('categories')['user'].nunique()
print("VI users by category:")
users_per_category_vi

VI unique users: 12262
VI users by category:


categories
art             6478
events          6669
foods            627
games           3341
geography       9759
health          5722
history         8670
mathematics     3700
nature         10107
philosophy      8164
politics       10385
religion        2729
rights          2082
sports          1473
Name: user, dtype: int64

## Processing ZH Data

In [68]:
file_path = 'WPData/zh_wp.txt'
replace_in_file_comma_spaces(file_path)

In [69]:
data_zh=load_and_prepare_data('WPData/zh_wp.txt')
import json

data_zh['categories'] = data_zh['categories'].apply(json.loads)

In [70]:
data_zh.sample(10)

Unnamed: 0,title,user,timestamp,categories,datetime
2480348,sts-2,tianxiaozhang,1180930541,[geography],2007-06-04 04:15:41
2716665,足球之夜,chowhotin,1191908718,[art],2007-10-09 05:45:18
299429,惡搞文化,kw9329,1202969818,"[art, nature]",2008-02-14 06:16:58
222930,胡適,tedytj,1267505024,"[art, history, nature, philosophy, politics]",2010-03-02 04:43:44
2240906,莫斯科戰役,evers,1258277807,[history],2009-11-15 09:36:47
1091773,氧化剂,cough,1113517975,[art],2005-04-14 22:32:55
4282176,安提帕特,djhuty,1253561734,"[art, politics]",2009-09-21 19:35:34
3027189,色格拉布魯日皇家體育會,msuker,1237713081,[sports],2009-03-22 09:11:21
1967912,梁靜茹,happyrabbit,1230887661,"[art, nature]",2009-01-02 09:14:21
2460226,唇顎裂,tonync,1155884739,"[health, history, nature]",2006-08-18 07:05:39


In [71]:
data_zh.to_pickle('converted/df_zh')

In [72]:
unique_users_zh = data_zh['user'].nunique()
print("ZH unique users:", unique_users_zh)
dfzh_exploded = data_zh.explode('categories')
users_per_category_zh = dfzh_exploded.groupby('categories')['user'].nunique()
print("ZH users by category:")
users_per_category_zh

ZH unique users: 76613
ZH users by category:


categories
art            54766
events         19625
foods           5436
games           8721
geography      35855
health         18639
history        36451
mathematics    19234
nature         53861
philosophy     23834
politics       40170
religion       17003
rights         17748
sports         12030
Name: user, dtype: int64

## Get processed files stored locally 

In [1]:
%%time
import pandas as pd
df_ar = pd.read_pickle('converted/df_ar')
df_de = pd.read_pickle('converted/df_de')
df_es = pd.read_pickle('converted/df_es')
df_fr = pd.read_pickle('converted/df_fr')
df_hu = pd.read_pickle('converted/df_hu')
df_it = pd.read_pickle('converted/df_it')

KeyboardInterrupt: 

In [2]:
%%time
import pandas as pd

df_et = pd.read_pickle('converted/df_it')

CPU times: user 5.74 s, sys: 17.2 s, total: 23 s
Wall time: 31.2 s


In [14]:
%%time
import pickle

data = pickle.load('converted/df_es')

TypeError: file must have 'read' and 'readline' attributes

In [5]:
df_es.sample(20)

Deutch unique users: 2682096


17064686

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle

# Define the directory where your pickle files are stored
directory = 'converted/'

# Initialize an empty DataFrame to hold all the data
combined_df = pd.DataFrame()

# Loop through all the pickle files in the directory
for filename in os.listdir(directory):
    if filename.startswith('df_'):
        print('Reading file: ',filename)
        # Determine the language from the filename, stripping the 'df_' prefix
        language = filename.split('_')[1]
        
        # Load the pickle file
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as file:
            data = pickle.load(file)
        
        # Create a DataFrame and add the language column
        df = pd.DataFrame(data)
        df['language'] = language
        
        # Append to the combined DataFrame
        combined_df = pd.concat([combined_df, df], ignore_index=True)
        print('File read complete: ',filename)

# Explode the categories list into separate rows
#combined_df = combined_df.explode('categories')

# Group by language and category and count the number of unique users
#category_counts = combined_df.groupby(['language', 'categories']).agg({'user': pd.Series.nunique}).reset_index()

# Pivot the data for visualization
#pivot_table = category_counts.pivot(index='categories', columns='language', values='user')

# Plotting
#pivot_table.plot(kind='bar', figsize=(14, 7))
#plt.title('Number of Unique Users by Category for Each Language')
#plt.xlabel('Category')
#plt.ylabel('Number of Users')
#plt.legend(title='Language')
#plt.show()


Reading file:  df_ar
File read complete:  df_ar
Reading file:  df_de
File read complete:  df_de
Reading file:  df_hu
File read complete:  df_hu
Reading file:  df_zh
File read complete:  df_zh
Reading file:  df_es
File read complete:  df_es
Reading file:  df_fr
File read complete:  df_fr
Reading file:  df_vi
File read complete:  df_vi
Reading file:  df_ru
File read complete:  df_ru
Reading file:  df_ja
File read complete:  df_ja
Reading file:  df_it


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import pickle

# Define the directory where your pickle files are stored
directory = 'converted/'

# Initialize an empty DataFrame to hold all the data
combined_df = pd.DataFrame()

# Loop through all the pickle files in the directory
for filename in os.listdir(directory):
    if filename.startswith('df_'):
        print('Reading file: ',filename)
        # Determine the language from the filename, stripping the 'df_' prefix
        language = filename.split('_')[1]
        
        # Load the pickle file
        filepath = os.path.join(directory, filename)
        with open(filepath, 'rb') as file:
            data = pickle.load(file)
        
        # Create a DataFrame and add the language column
        df = pd.DataFrame(data)
        df['language'] = language
        
        # Append to the combined DataFrame
        combined_df = pd.concat([combined_df, df], ignore_index=True)
        print('File read complete: ',filename)

# Explode the categories list into separate rows
#combined_df = combined_df.explode('categories')

# Group by language and category and count the number of unique users
#category_counts = combined_df.groupby(['language', 'categories']).agg({'user': pd.Series.nunique}).reset_index()

# Pivot the data for visualization
#pivot_table = category_counts.pivot(index='categories', columns='language', values='user')

# Plotting
#pivot_table.plot(kind='bar', figsize=(14, 7))
#plt.title('Number of Unique Users by Category for Each Language')
#plt.xlabel('Category')
#plt.ylabel('Number of Users')
#plt.legend(title='Language')
#plt.show()


Reading file:  df_ar
File read complete:  df_ar
Reading file:  df_de
File read complete:  df_de
Reading file:  df_hu
File read complete:  df_hu
Reading file:  df_zh
File read complete:  df_zh
Reading file:  df_es
File read complete:  df_es
Reading file:  df_fr
File read complete:  df_fr
Reading file:  df_vi
File read complete:  df_vi
Reading file:  df_ru
File read complete:  df_ru
Reading file:  df_ja


In [17]:
import pandas as pd
from datetime import datetime
import json



# Function to count unique users per category
def count_users_per_category(data):
    # Explode the categories list into separate rows
    categories_expanded = data.explode('categories')
    
    # Group by category and count unique users
    users_per_category = categories_expanded.groupby('categories')['user'].nunique()
    
    return users_per_category

# Load French and Spanish data
file_path_fr = 'WPData/fr_wiki.txt'
file_path_es = 'WPData/es_wiki.txt'
data_fr = load_and_prepare_data(file_path_fr)
data_es = load_and_prepare_data(file_path_es)

# Count unique articles in each dataset
unique_articles_fr = data_fr['title'].nunique()
unique_articles_es = data_es['title'].nunique()

# Count unique users in each dataset
unique_users_fr = data_fr['user'].nunique()
unique_users_es = data_es['user'].nunique()

# Find the most recent edit date in each dataset
most_recent_edit_fr = data_fr['datetime'].max()
most_recent_edit_es = data_es['datetime'].max()

# Count users per category for both datasets
users_per_category_fr = count_users_per_category(data_fr)
users_per_category_es = count_users_per_category(data_es)

# Output results
print(f"French data - Unique articles: {unique_articles_fr}, Unique users: {unique_users_fr}, Most recent edit: {most_recent_edit_fr}")
print(f"Spanish data - Unique articles: {unique_articles_es}, Unique users: {unique_users_es}, Most recent edit: {most_recent_edit_es}")
print("French Users per Category:")
print(users_per_category_fr)
print("Spanish Users per Category:")
print(users_per_category_es)


KeyboardInterrupt: 