In [None]:
import pyarrow as pa
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Notebook configuration. All values are blank placeholders and must be filled in before running.
# db_name / username / url feed the psycopg2 connection parameters (DB_PARAMS);
# connection_string is passed to connectorx when reading the wikipedia_text table.
lengths_path = ""  # Path to the Arrow IPC file (opened with pa.ipc.open_file) holding each article's length in tokens, e.g. '/net/projects/interp/lengths.arrow'
db_name = ""  # SQL database name, e.g. 'interp'
username = ""  # DB username, e.g. 'muchane'
url = ""  # DB host name or IP address, e.g. '127.0.0.1'
connection_string = ""  # SQL connection string, e.g. 'postgresql://muchane@localhost/interp'


In [None]:
# Open the Arrow IPC file at `lengths_path` and materialise its contents as a pandas DataFrame.
# Both handles are released as soon as the frame has been read.
with pa.OSFile(lengths_path, 'rb') as source, pa.ipc.open_file(source) as reader:
    df = reader.read_pandas()


In [None]:
# Sum of the `length` column over every loaded row.
df.loc[:, "length"].sum()

In [None]:
# Order the articles from longest to shortest.
df = df.sort_values("length", ascending=False)

In [None]:
# Keep the first 3,921,600 rows; after the preceding descending sort these are the longest articles.
# NOTE(review): select_rows() further down targets the same 3,921,600 row count while also
# filtering by language and length - confirm that truncating here is still intended.
df = df.head(3921600)

In [None]:
# Number of rows currently held in df.
df.shape[0]

In [None]:
# Preview the first 20 rows.
df.iloc[:20]

In [None]:
import pandas as pd
import psycopg2
from psycopg2.extras import execute_values
from concurrent.futures import ProcessPoolExecutor
import os

# Keyword arguments that update_batch() unpacks into psycopg2.connect()
DB_PARAMS = dict(
    dbname=db_name,
    user=username,
    host=url,
    port=5432,  # Change if your PostgreSQL uses a different port
)

# Number of processes to use.
# os.cpu_count() returns None when the CPU count cannot be determined; fall back to a
# single process so the chunk-size arithmetic in main() always receives an integer.
NUM_PROCESSES = os.cpu_count() or 1

# Batch size for updates: maximum number of uids sent in one UPDATE statement
BATCH_SIZE = 10000

def update_batch(uids):
    """Set ``acts = -1`` on every ``wikipedia_text`` row whose uid is in `uids`.

    Opens its own connection from DB_PARAMS, runs a single UPDATE and commits.
    Any exception raised by the UPDATE or the commit is printed and the
    transaction is rolled back; it is not re-raised. The cursor and the
    connection are closed in every case.

    Args:
        uids: list of uid values, passed to the query as one parameter.
    """
    connection = psycopg2.connect(**DB_PARAMS)
    cursor = connection.cursor()

    # The whole list is bound to the single %s placeholder inside ANY(...).
    statement = """
        UPDATE wikipedia_text
        SET acts = -1
        WHERE uid = ANY(%s)
        """

    try:
        cursor.execute(statement, (uids,))
        connection.commit()
    except Exception as err:
        # Best effort: report the failure, undo the partial transaction, carry on.
        print(f"Error updating batch: {err}")
        connection.rollback()
    finally:
        cursor.close()
        connection.close()

def process_uids(uids):
    """Run update_batch() over `uids` in consecutive slices of at most BATCH_SIZE items."""
    batches = (uids[start:start + BATCH_SIZE] for start in range(0, len(uids), BATCH_SIZE))
    for batch in batches:
        update_batch(batch)

def main(df):
    """Apply the ``acts = -1`` update to every uid in `df`, fanned out over worker processes.

    The uids are split into at most one chunk per worker; each worker runs
    process_uids() on its chunk.

    Args:
        df: DataFrame with a ``uid`` column.

    Raises:
        Any exception raised inside a worker (for example a failed database
        connection) is re-raised here instead of being silently dropped.
    """
    # Convert Series to list (plain Python values, picklable for the workers)
    uid_list = df['uid'].tolist()
    if not uid_list:
        # Nothing to do; also avoids a zero chunk size below.
        print("No uids to update.")
        return

    # Never start more workers than there are uids, and tolerate NUM_PROCESSES being None/0.
    workers = max(1, min(NUM_PROCESSES or 1, len(uid_list)))

    # Ceiling division: guarantees chunk_size >= 1 and at most `workers` chunks
    # (floor division produced a zero step for short lists and an extra straggler chunk).
    chunk_size = -(-len(uid_list) // workers)
    uid_chunks = [uid_list[i:i + chunk_size] for i in range(0, len(uid_list), chunk_size)]

    # Use ProcessPoolExecutor to update in parallel
    with ProcessPoolExecutor(max_workers=workers) as executor:
        # executor.map only raises a worker's exception when its result is retrieved,
        # so drain the iterator to surface failures.
        for _ in executor.map(process_uids, uid_chunks):
            pass

    print("Update completed.")

In [None]:
# Applies the `acts = -1` UPDATE (see update_batch) to every uid in new_df.
# NOTE(review): `new_df` is first assigned further down, by `new_df = select_rows(df,id_df)`.
# Read top to bottom, this cell runs before `new_df` exists, so it has to be executed
# (or moved) after that cell for a fresh-kernel run to succeed.
main(new_df)

In [None]:
import connectorx as cx

In [None]:
def load_wikipedia_text_to_dataframe(k=None,num_threads=64):
    """Load ``uid``, ``lang`` and ``lang_id`` from ``wikipedia_text`` into a pandas DataFrame.

    Args:
        k: Optional row limit. ``None`` (the default) reads the whole table.
            Any other value is converted with ``int()`` and used as ``LIMIT``;
            ``k=0`` therefore returns an empty frame rather than the full table.
        num_threads: Number of partitions requested from connectorx for the
            full-table read (passed as ``partition_num``, partitioning on ``uid``).
            Ignored when `k` is given.

    Returns:
        pandas.DataFrame with columns ``uid``, ``lang``, ``lang_id``.

    Raises:
        ValueError: if `k` is negative or cannot be converted to an integer.
    """
    if k is None:
        # Full-table read, split on uid so connectorx can fetch partitions in parallel.
        query = """
        SELECT uid, lang, lang_id
        FROM wikipedia_text
        """
        return cx.read_sql(
            connection_string,
            query,
            partition_on="uid",
            partition_num=num_threads,
            return_type="pandas",
        )

    # Coerce to int before interpolating so only a number can ever reach the SQL text.
    limit = int(k)
    if limit < 0:
        raise ValueError(f"k must be a non-negative integer, got {k!r}")

    query = f"""
        SELECT uid, lang, lang_id
        FROM wikipedia_text
        LIMIT {limit};
        """
    return cx.read_sql(connection_string, query, return_type="pandas")

In [None]:
# Read uid / lang / lang_id for the entire wikipedia_text table (no row limit).
lang_df = load_wikipedia_text_to_dataframe(k=None)

In [None]:
# Attach lang / lang_id to each article by matching the `uid` columns of both frames.
# DataFrame.join(other, on="uid") would compare df's uid with lang_df's *index* and
# raise on the overlapping `uid` column, so a column-to-column merge is used instead.
lang_df_test = df.merge(lang_df, on="uid", how="left")

In [None]:
# Put both frames in ascending uid order and give each a fresh 0..n-1 index.
df = df.sort_values("uid").reset_index(drop=True)
lang_df = lang_df.sort_values("uid").reset_index(drop=True)

In [None]:
# Look up each article's lang_id by its uid instead of by row position: positional
# assignment is only correct when df and lang_df contain exactly the same uids, which
# does not hold once df has been truncated or filtered.
# uids missing from lang_df get NaN; drop_duplicates keeps the lookup index unique.
df["lang_id"] = df["uid"].map(lang_df.drop_duplicates("uid").set_index("uid")["lang_id"])

In [None]:
def select_rows(df, id_df, total_rows=3921600, min_length=255, random_state=42):
    """Pick `total_rows` articles with a per-language share proportional to active users.

    For every language in `id_df` a quota ``int(total_rows * users / total_users)``
    is computed. Only rows with ``length >= min_length`` are eligible. Per language:

    * if at most `quota` eligible rows exist, all of them are taken;
    * otherwise `quota` rows are sampled (seeded) from the ``2 * quota`` longest
      eligible rows, or from all eligible rows when fewer than that exist.

    Whatever is still missing from `total_rows` (truncated quotas, languages with
    too few rows) is filled with the longest not-yet-selected ``lang_id == 0`` rows.

    Args:
        df: DataFrame with ``lang_id`` and ``length`` columns and a unique index
            (the index is used to avoid picking a row twice).
        id_df: DataFrame with ``lang_id`` and ``active_users`` columns.
        total_rows: Target number of rows to return.
        min_length: Minimum ``length`` for a row to be eligible.
        random_state: Seed for the per-language sampling.

    Returns:
        DataFrame of the selected rows, at most `total_rows` long (shorter only
        if language 0 cannot cover the shortfall).

    Raises:
        ValueError: if the active-user counts do not sum to a positive number.
    """
    users_by_lang = id_df.groupby('lang_id')['active_users'].sum()
    total_active_users = users_by_lang.sum()
    if total_active_users <= 0:
        raise ValueError("id_df['active_users'] must sum to a positive number")

    # Quotas keyed by lang_id itself (not by id_df's row index), truncated towards zero.
    rows_per_lang = (total_rows * (users_by_lang / total_active_users)).astype(int)

    eligible = df[df['length'] >= min_length]

    selected_rows = []
    for lang_id, quota in rows_per_lang.items():
        quota = int(quota)
        candidates = eligible[eligible['lang_id'] == lang_id].sort_values('length', ascending=False)

        if len(candidates) <= quota:
            # Not enough rows to sample from: take everything available.
            selected = candidates
        else:
            # Sample the full quota from the longest 2 * quota candidates.
            pool = candidates.head(2 * quota)
            selected = pool.sample(n=quota, random_state=random_state)

        selected_rows.append(selected)

    # Combine all selected rows
    result_df = pd.concat(selected_rows)

    # Fill the remaining shortfall from lang_id 0, longest first, skipping rows already chosen
    remaining_rows = total_rows - len(result_df)
    lang_0_df = eligible[eligible['lang_id'] == 0].sort_values('length', ascending=False)
    lang_0_unused = lang_0_df[~lang_0_df.index.isin(result_df.index)]
    additional_rows = lang_0_unused.head(remaining_rows)

    # Final head() guards the upper bound on the number of rows returned
    result_df = pd.concat([result_df, additional_rows]).head(total_rows)

    print(f"Total rows selected: {len(result_df)}")
    return result_df

In [None]:

# Build the per-language active-user table from table.csv (columns 'WP\ncode' and 'Active\nusers').
id_df = (
    pd.read_csv("table.csv")[['WP\ncode', 'Active\nusers']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Create an 'id' field starting at 0
id_df['id'] = range(len(id_df))

# Ensure 'en' and 'simple' have the same id value
en_index = id_df[id_df['WP\ncode'] == 'en'].index
simple_index = id_df[id_df['WP\ncode'] == 'simple'].index

if not en_index.empty and not simple_index.empty:
    # Close the gap left by 'simple' first, then copy the (possibly shifted) 'en' id.
    # Doing it in this order keeps both codes on the same id even if 'simple'
    # appears before 'en' in the table.
    rows_to_adjust = id_df.index > simple_index[0]
    id_df.loc[rows_to_adjust, 'id'] -= 1

    en_value = id_df.loc[en_index, 'id'].values[0]
    id_df.loc[simple_index, 'id'] = en_value

id_df = id_df.rename(columns={'Active\nusers':'active_users','id': 'lang_id'})
# .copy() so the column assignment below writes to an independent frame
# instead of a filtered view (avoids SettingWithCopyWarning).
id_df = id_df.loc[id_df['lang_id'] < 128].copy()
# astype(str) first so the parsing also works if read_csv already inferred numbers;
# the thousands separators are removed as a literal character.
id_df['active_users'] = (
    id_df['active_users'].astype(str).str.replace(',', '', regex=False).astype(int)
)
# 'en' and 'simple' now share a lang_id, so add their active users together
id_df = id_df.groupby(['lang_id'], as_index=False)['active_users'].sum()
id_df

In [None]:
# Keep only the articles whose lang_id is below 128.
df = df[df["lang_id"].lt(128)]

In [None]:
# Draw the language-balanced article selection.
new_df = select_rows(df=df, id_df=id_df)

In [None]:
# Shortest article length among the selected rows.
new_df.loc[:, "length"].min()

In [None]:
N = 25  # You can adjust this number as needed

# Share of selected rows per language, in percent (value_counts orders it largest first).
# The string copy of lang_id is stored on new_df and used as the categorical bar label.
new_df["lang_id_str"] = new_df["lang_id"].astype(str)
lang_percentages = 100 * new_df["lang_id_str"].value_counts(normalize=True)

# Keep the N largest languages and fold everything after position N into 'Other'
top_n = lang_percentages.nlargest(N)
other = pd.Series([lang_percentages.iloc[N:].sum()], index=['Other'])
lang_percentages_grouped = pd.concat([top_n, other])

# Largest share first ('Other' is ranked together with the individual languages)
lang_percentages_sorted = lang_percentages_grouped.sort_values(ascending=False)

# One bar per language
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(lang_percentages_sorted.index, lang_percentages_sorted.to_numpy())

# Axis labels and title
ax.set(
    xlabel='Language',
    ylabel='Percentage',
    title=f'Percentage of Rows by Language (Top {N} + Other)',
)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')

# Write each bar's percentage just above it
for bar in bars:
    height = bar.get_height()
    ax.text(
        bar.get_x() + bar.get_width()/2.,
        height,
        f'{height:.1f}%',
        ha='center',
        va='bottom',
    )

# Adjust layout to prevent cutting off labels
fig.tight_layout()

# Save to disk first, then render inline
fig.savefig("lang_stats.png")
plt.show()


In [None]:
# Display the uid column of the selected rows.
new_df.loc[:, 'uid']