# Step2: Cleaning the dataset

## 1- Importing the libraries:

In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## 2- Load the dataset:

In [2]:
# Load the file you just created
df = pd.read_csv('../data/processed/combined_dataset.csv')

# Check if it looks right
print("Data shape:", df.shape)
print(df.head())

Data shape: (234164, 2)
                                          text  language
0           Un avion est en train de décoller.  Français
1              Un homme est en train de fumer.  Français
2       Une personne jette un chat au plafond.  Français
3  Une femme prend et tient un bébé kangourou.  Français
4                   Un homme joue de la flûte.  Français


## 3-Function for cleaning the dataset:

In [3]:
def clean_text(text):
    # 1. Convert to lowercase (French/English)
    text = str(text).lower() 
    
    # 2. Remove URLs (http://...)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 3. Remove punctuation EXCEPT specifically useful characters if needed
    # (For now, we remove standard punctuation like !?,.)
    text = re.sub(r'[^\w\s]', '', text)
    
    # 4. Remove extra spaces
    text = text.strip()
    
    return text

# Apply the function to your text column
df['text_clean'] = df['text'].apply(clean_text)

print("Cleaning complete. Example:")
print(df[['text', 'text_clean']].head())
df_small = df.sample(frac=0.1, random_state=42)

Cleaning complete. Example:
                                          text  \
0           Un avion est en train de décoller.   
1              Un homme est en train de fumer.   
2       Une personne jette un chat au plafond.   
3  Une femme prend et tient un bébé kangourou.   
4                   Un homme joue de la flûte.   

                                   text_clean  
0           un avion est en train de décoller  
1              un homme est en train de fumer  
2       une personne jette un chat au plafond  
3  une femme prend et tient un bébé kangourou  
4                   un homme joue de la flûte  


## 4-Split the data into training and testing :

In [4]:
X = df_small['text_clean']  # The input (features)
y = df_small['language']    # The answer (target)

# Split: 80% for training, 20% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training samples: {len(X_train)}")
print("training examples:")
print(X_train.tail(10))
print(f"Testing samples: {len(X_test)}")
print("testing examples:")
print(X_test.tail(10))

Training samples: 18732
training examples:
116329          if you promise to be back by 230 you may go
47235     il y a des phrases de merde que je me refuse à...
48226     mes connaissances en allemand sont malheureuse...
46593                             vous êtes en panne didées
183735    المهم بنكيران ماكيديرش المرقةغادي تكون المدينة...
73304     je parle luxembourgeois allemand français et a...
164925                                    chkoun 9l9ek liya
52983                                           tom saluait
75444       the motorway was closed due to a major accident
218695    منطقة سيرينجيتي فيها لبارك لوطني نتاع سيرينجيت...
Name: text_clean, dtype: object
Testing samples: 4684
testing examples:
155291                                    malk mkarfas rask
190895    قال توم دوكسبري مدير مشروع ستاردست غادي تتحرك ...
7992      la personne en veste bleue porte un casque coloré
58660     nous sommes semblables à celui qui a oublié sa...
158912                                       

## 5- : Vectorization

In [6]:
# Create the Vectorizer
vectorizer = TfidfVectorizer(
    analyzer='char',       # Look at characters, not words
    ngram_range=(1, 3),    # Look at 1, 2, and 3 characters at a time
    max_features=5000      # Limit to top 5000 patterns to keep it fast
)

# Learn vocabulary from X_train and transform it
X_train_tfidf = vectorizer.fit_transform(X_train)

# ONLY transform X_test (do not fit on it!)
X_test_tfidf = vectorizer.transform(X_test)

print("Vectorization complete.")
print(f"Matrix shape: {X_train_tfidf.shape}")

Vectorization complete.
Matrix shape: (18732, 5000)


In [7]:
import joblib

# 1. Pack all the variables you need into a list or tuple
data_to_save = (X_train_tfidf, X_test_tfidf, y_train, y_test, vectorizer)

# 2. Save them to a file named 'processed_data.pkl'
joblib.dump(data_to_save, '../data/processed/processed_data.pkl')

print("✅ Data and Vectorizer saved successfully!")

✅ Data and Vectorizer saved successfully!
