In [None]:
#imports
import re
import unicodedata

import contractions
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import pyarrow.parquet as pa
import sklearn
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud

# Download the 'punkt_tab' tokenizer resource (one-time setup).
# NOTE(review): presumably this is the data word_tokenize needs in the
# tokenization step further down — confirm against the installed nltk version.
nltk.download('punkt_tab')




In [None]:
# Read the dataset from its parquet file and summarise its structure.
data = pd.read_parquet('train-00000-of-00001-a0f92f8fbc6b2308.parquet')
print(data.columns)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emitted an extra "None" line.
data.info()



In [None]:
# Write the frame out as CSV (without the index), then reload it so that the
# following cells operate on the CSV copy of the data.
csv_path = 'dataset.csv'
data.to_csv(csv_path, index=False)
data = pd.read_csv(csv_path)

In [None]:
"""
# Step 2: Define a cleaning function
def clean_text(text):
    if not isinstance(text, str):  # Handle missing or non-string values
        return ""
    text = re.sub(r'<[^>]+>', '', text)  # Remove HTML tags
    text = re.sub(r'[0-9]', '', text)  # Remove all numeric digits
    text = re.sub(r'[^\w\s\']', '', text)  # Remove unwanted characters (keep words, spaces, and apostrophes)
    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces with a single space
    return text.strip()  # Remove leading and trailing whitespace

# Step 3: Apply cleaning to all relevant text columns
text_columns = ['text']  # Update this with the list of text columns to clean
for column in text_columns:
    data[f'{column}_cleaned'] = data[column].apply(clean_text)

# Step 4: Handle missing values (optional)
for column in text_columns:
    data.dropna(subset=[f'{column}_cleaned'], inplace=True)  # Drop rows where the cleaned text is missing

# Step 5: Save the cleaned data back to CSV
data.to_csv('cleaned_dataset.csv', index=False)
print("Cleaned dataset saved as 'cleaned_dataset.csv'")

# Step 6: Preview the cleaned data
print(data.head())
"""

def clean_text(text):
    """Normalise one raw text sample before tokenisation and vectorisation.

    Steps, in order:
      1. Expand contractions while the apostrophes are still present.
      2. Keep only letters (with the combining marks attached to them) of any
         script, plus whitespace; digits, punctuation and symbols are removed.
      3. Collapse every run of whitespace to a single space and trim the ends.

    Args:
        text: Raw sample. Anything that is not a ``str`` (for example NaN for
            a missing value) is treated as empty.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # Must run before punctuation is stripped: without the apostrophe,
    # "we're" turns into "were" and "I'll" into "ill", which can no longer be
    # told apart from ordinary words.
    text = contractions.fix(text)

    # Unicode general categories L* (letters) and M* (combining marks) cover
    # every script, instead of a hand-picked list of alphabets that silently
    # dropped everything else.
    kept = [
        ch for ch in text
        if ch.isspace() or unicodedata.category(ch)[0] in "LM"
    ]

    # split()/join collapses internal whitespace runs and trims both ends.
    return " ".join("".join(kept).split())

# Step 2: Apply the cleaning function to the 'text' column
data['cleaned_text'] = data['text'].apply(clean_text)

# Step 3: Remove rows that have no usable text.
# clean_text returns "" (never NaN) for missing or non-string input, so a
# dropna() on this column could never remove anything; filter on emptiness.
has_text = data['cleaned_text'].str.strip().ne('')
# .copy() so that later column assignments act on an independent frame.
data = data.loc[has_text].copy()

# Step 4: Tokenization (split the cleaned text into words)
def tokenize_text(text):
    """Lower-case ``text`` and return the result of ``word_tokenize`` on it."""
    lowered = text.lower()  # lower-case first so tokens are standardized
    tokens = word_tokenize(lowered)
    return tokens

# Run tokenize_text over every cleaned sample and store the results in 'tokens'.
data['tokens'] = data['cleaned_text'].map(tokenize_text)



In [None]:
# Step 5: Turn the 'language_code' labels into integer ids with LabelEncoder
label_encoder = LabelEncoder()
language_ids = label_encoder.fit_transform(data['language_code'])
data['language_encoded'] = language_ids

# Step 6: Show which integer id each original label was assigned
known_labels = label_encoder.classes_
label_to_id = dict(zip(known_labels, label_encoder.transform(known_labels)))
print("Mapping:", label_to_id)

# Step 7: Show the original and the encoded labels side by side
label_columns = ['language_code', 'language_encoded']
print(data[label_columns])

# Step 8: Optional - map the encoded ids back to the original labels
inverse_labels = label_encoder.inverse_transform(data['language_encoded'])
print("Inverse transformation result:", inverse_labels)

In [None]:
# Persist the cleaned frame, then summarise it.
data.to_csv('cleaned_dataset.csv', index=False)
print("Cleaned dataset saved as 'cleaned_dataset.csv'")
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() added a stray "None" line to the output.
data.info()
print(data.columns)
# Last expression of the cell, so it is rendered as the cell's output.
data.head(n=15)

In [None]:
# Build TF-IDF features from the 'cleaned_text' column.
# No ngram_range is passed, so the vectorizer runs with its default n-gram
# setting; max_features=5000 caps the size of the vocabulary.
# NOTE(review): an earlier comment here described trigrams (n=3). If that was
# the intent, pass ngram_range=(3, 3) explicitly — confirm before changing.
vectorizer = TfidfVectorizer(max_features = 5000)

# Fit the vocabulary and transform the cleaned text in one step
X = vectorizer.fit_transform(data['cleaned_text'])

# Check the shape of the matrix (rows = documents, columns = features).
# Print the tuple itself: `{X.shape}` built a one-element set and printed it
# wrapped in braces.
print(X.shape)



In [None]:
#Exploratory data analysis EDA
# Bar chart of how many rows each language code has
language_counts = data['language_code'].value_counts()
ax = language_counts.plot(kind='bar', color='skyblue')
ax.set_title('Language distribution')
ax.set_xlabel('Language')
ax.set_ylabel('count')
plt.show()

In [None]:
# Count missing values per column (rendered as this cell's output)
data.isna().sum()

In [None]:
# Histogram of cleaned-text lengths, measured with len() on each sample
data['text_length'] = data['cleaned_text'].map(len)
ax = data['text_length'].hist(color='skyblue', bins=30)
ax.set_title('Text Length Distribution')
ax.set_xlabel('Text Length')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# One word cloud per language. All drawing happens inside the loop so that
# every language gets its own figure; with the plotting calls placed after the
# loop, only the last language was ever displayed.
for lang in data['language_code'].unique():
    # Join all cleaned samples of the current language into one string
    lang_data = data[data['language_code'] == lang]
    lang_text = ' '.join(lang_data['cleaned_text'])
    if not lang_text.strip():
        # WordCloud.generate() needs at least one word, so skip empty input
        print(f'Skipping {lang}: no text available for a word cloud')
        continue

    # NOTE(review): WordCloud's default font may not cover non-Latin scripts;
    # pass font_path=... if some languages render as empty boxes — confirm.
    wordcloud = WordCloud(width=800, height=400).generate(lang_text)

    fig, ax = plt.subplots()
    ax.imshow(wordcloud, interpolation='bilinear')  # 'bilinear' interpolation for smoother edges
    ax.axis('off')  # Turn off the axis
    ax.set_title(f'Most Frequent Words in {lang}')  # Title with the language name
    plt.show()  # Show the wordcloud for the current language
    plt.close(fig)  # Release the figure before moving to the next language