In [1]:
import pandas as pd

# Spécifiez le chemin du fichier
augment_file = "cefr_leveled_texts.csv"

# Chargez le fichier CSV dans un dataframe
df_aug = pd.read_csv(augment_file)

# Affichez les premières lignes pour vérifier la structure du dataframe
df_aug.head()

Unnamed: 0,text,label
0,Hi!\nI've been meaning to write for ages and f...,B2
1,﻿It was not so much how hard people found the ...,B2
2,Keith recently came back from a trip to Chicag...,B2
3,"The Griffith Observatory is a planetarium, and...",B2
4,-LRB- The Hollywood Reporter -RRB- It's offici...,B2


In [2]:
# Filtrer les lignes contenant "C1" ou "C2" dans la colonne "label"
filtered_df = df_aug[df_aug['label'].isin(['C1', 'C2'])]

# Affichez les premières lignes du nouveau dataframe
filtered_df.head()

Unnamed: 0,text,label
558,Police and neighbors were searching for three ...,C1
559,At least 26 people were killed Monday in the S...,C1
560,"To many who watched his long, rambling video s...",C1
561,The night Bob Hawke lost the Labor leadership ...,C1
562,﻿Swedish prisons have long had a reputation ar...,C1


In [3]:
# Chargez le fichier "train_data_challenge.csv" dans un dataframe
train_df = pd.read_csv("train_data_challenge.csv")

# Affichez les premières lignes pour vérifier la structure du dataframe
train_df.head()

Unnamed: 0,Id,text,level
0,0,\r\r\n My friend Meg was going out to wor...,3
1,1,\r\r\n Hello I'm sorry but i'm on holiday...,0
2,2,\r\r\n Name : MaryAge : 62Hair color : bl...,0
3,3,"\r\r\n hi , my name is laila , i live in ...",0
4,4,\r\r\n A pink sweater : 10 $.A black dres...,0


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


False

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [6]:
# Tokenize the text into words and calculate the mean size for "C1"
mean_size_c1_words = train_df[train_df['level'] == 4]['text'].apply(word_tokenize).apply(len).mean()

# Tokenize the text into words and calculate the mean size for "C2"
mean_size_c2_words = train_df[train_df['level'] == 5]['text'].apply(word_tokenize).apply(len).mean()

# Print the results
print(f"Mean size (in words) for the label C1: {mean_size_c1_words:.2f}")
print(f"Mean size (in words) for the label C2: {mean_size_c2_words:.2f}")


Mean size (in words) for the label C1: 187.27
Mean size (in words) for the label C2: 192.30


In [7]:
# Function to calculate the sum of words for a given number of phrases
def calculate_word_sum(sentences, num_phrases):
    selected_phrases = sentences[:num_phrases]
    selected_text = ' '.join(selected_phrases)
    words = word_tokenize(selected_text)
    return len(words)

In [8]:
# Tokenize
filtered_df_tok = filtered_df.copy()
filtered_df_tok['sentences'] = filtered_df_tok['text'].apply(sent_tokenize)

In [9]:
print(filtered_df_tok)

                                                   text label  \
558   Police and neighbors were searching for three ...    C1   
559   At least 26 people were killed Monday in the S...    C1   
560   To many who watched his long, rambling video s...    C1   
561   The night Bob Hawke lost the Labor leadership ...    C1   
562   ﻿Swedish prisons have long had a reputation ar...    C1   
...                                                 ...   ...   
1489  Light propagating in the vicinity of astrophys...    C2   
1490  Future of dentistry has become one of the most...    C2   
1491  ﻿The forests – and suburbs – of Europe are ech...    C2   
1492  Hedge funds are turning bullish on oil once ag...    C2   
1493  Without additional heating, radiative cooling ...    C2   

                                              sentences  
558   [Police and neighbors were searching for three...  
559   [At least 26 people were killed Monday in the ...  
560   [To many who watched his long, rambling

In [10]:
# Function to find the number of phrases with the nearest word count
def find_nearest_phrases(sentences, average_size):
    num_phrases_range = range(1, len(sentences) + 1)
    word_counts = [calculate_word_sum(sentences, num_phrases) for num_phrases in num_phrases_range]
    nearest_num_phrases = min(num_phrases_range, key=lambda x: abs(word_counts[x - 1] - average_size))
    return sentences[:nearest_num_phrases]

# Apply the function to each row of filtered_df
filtered_df_tok['nearest_phrases'] = filtered_df_tok.apply(lambda row: find_nearest_phrases(row['sentences'], mean_size_c1_words) if row['label'] == 'C1' else find_nearest_phrases(row['sentences'], mean_size_c2_words), axis=1)

# Display the resulting dataframe
print(filtered_df_tok['nearest_phrases'].apply(len))

558     5
559     7
560     8
561     7
562     8
       ..
1489    7
1490    9
1491    5
1492    7
1493    6
Name: nearest_phrases, Length: 443, dtype: int64


In [11]:
# Count the number of occurrences for each label in filtered_df_tok
label_counts = filtered_df_tok['label'].value_counts()

# Print the results
print("Number of occurrences for each label:")
print(label_counts)

Number of occurrences for each label:
C1    241
C2    202
Name: label, dtype: int64


In [12]:
# Count the number of occurrences for each label in filtered_df_tok
label_counts = train_df['level'].value_counts()

# Print the results
print("Number of occurrences for each label:")
print(label_counts)

Number of occurrences for each label:
0    8551
1    5717
2    4007
3    1757
4     373
5      40
Name: level, dtype: int64


In [13]:
print(train_df.iloc[395]) 

Id                                                     395
text     \r\r\n      Mark as the &quot;bowling alley&qu...
level                                                    2
Name: 395, dtype: object


In [14]:
import re

def clean_text(text):
    # Remove '\r', '\n', '\t' characters
    text = re.sub(r'[\r\n\t]', ' ', text)
    
    # Remove special characters like &quot; or %% or ''
    text = re.sub(r'&quot;|%{2}|\'\'', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Example usage
example_text = "\r\r\n Mark as the &quot;bowling alley&quot;..."
cleaned_text = clean_text(example_text)
print(cleaned_text)

Mark as the bowling alley...


In [15]:
# Apply the clean_text function to the "text" column
train_df['cleaned_text'] = train_df['text'].apply(clean_text)

# Display the resulting DataFrame
print(train_df[['text', 'cleaned_text']])

                                                    text  \
0      \r\r\n      My friend Meg was going out to wor...   
1      \r\r\n      Hello I'm sorry but i'm on holiday...   
2      \r\r\n      Name : MaryAge : 62Hair color : bl...   
3      \r\r\n      hi , my name is laila , i live in ...   
4      \r\r\n      A pink sweater : 10 $.A black dres...   
...                                                  ...   
20440  \r\r\n      Hi mom and pop!I'm going to the po...   
20441  \r\r\n      Hello, I don't know if the things ...   
20442  \r\r\n      I'm Ericka. I live in Paris .I get...   
20443  \r\r\n      Hi, Jessy! You're on vacation, It'...   
20444  \r\r\n      Hi, my name is Gregory. I'm thirdy...   

                                            cleaned_text  
0      My friend Meg was going out to work like every...  
1      Hello I'm sorry but i'm on holiday when you ca...  
2      Name : MaryAge : 62Hair color : blondeEyes: : ...  
3      hi , my name is laila , i live in pa

In [16]:
# Chargez le fichier "train_data_challenge.csv" dans un dataframe
test_df = pd.read_csv("test_data_challenge.csv")

# Affichez les premières lignes pour vérifier la structure du dataframe
test_df.head()

Unnamed: 0,Id,text
0,0,"\r\r\n Hi granny, Look at my photos about..."
1,1,"\r\r\n Hi, I'm twenty on Monday. I'm havi..."
2,2,\r\r\n make 2 teams of 6 or more players ...
3,3,"\r\r\n Yes, I can recommend a green Amazo..."
4,4,\r\r\n john discovers that Isabella doesn...


In [17]:
# Apply the clean_text function to the "text" column
test_df['cleaned_text'] = test_df['text'].apply(clean_text)

# Display the resulting DataFrame
print(test_df[['text', 'cleaned_text']])

                                                   text  \
0     \r\r\n      Hi granny, Look at my photos about...   
1     \r\r\n      Hi, I'm twenty on Monday. I'm havi...   
2     \r\r\n      make 2 teams of 6 or more players ...   
3     \r\r\n      Yes, I can recommend a green Amazo...   
4     \r\r\n      john discovers that Isabella doesn...   
...                                                 ...   
6810  \r\r\n      we can find difference between the...   
6811  \r\r\n      I traveled to Africa last year. I ...   
6812  \r\r\n      Hello Im writing to the website in...   
6813  \r\r\n      Hello My name is Jean-Charles I'm ...   
6814      \r\r\n      I always go to small shops.\r\r\n   

                                           cleaned_text  
0     Hi granny, Look at my photos about my holliday...  
1     Hi, I'm twenty on Monday. I'm having a party i...  
2     make 2 teams of 6 or more players one ground s...  
3     Yes, I can recommend a green Amazon hotel. It'...  
4

In [18]:
# Reset the index
filtered_df_tok_reset = filtered_df_tok.reset_index(drop=True)

# Rename the 'label' column to 'level'
filtered_df_tok_reset.rename(columns={'label': 'level'}, inplace=True)

# Replace 'C1' with 4 and 'C2' with 5 in the 'level' column
filtered_df_tok_reset['level'].replace({'C1': 4, 'C2': 5}, inplace=True)

# Display the resulting dataframe with the updated 'level' column
print(filtered_df_tok_reset)

                                                  text  level  \
0    Police and neighbors were searching for three ...      4   
1    At least 26 people were killed Monday in the S...      4   
2    To many who watched his long, rambling video s...      4   
3    The night Bob Hawke lost the Labor leadership ...      4   
4    ﻿Swedish prisons have long had a reputation ar...      4   
..                                                 ...    ...   
438  Light propagating in the vicinity of astrophys...      5   
439  Future of dentistry has become one of the most...      5   
440  ﻿The forests – and suburbs – of Europe are ech...      5   
441  Hedge funds are turning bullish on oil once ag...      5   
442  Without additional heating, radiative cooling ...      5   

                                             sentences  \
0    [Police and neighbors were searching for three...   
1    [At least 26 people were killed Monday in the ...   
2    [To many who watched his long, rambling 

In [19]:
# Print the 400th row of filtered_df_tok_reset
# Print the "nearest_phrases" column for the 400th row
print(filtered_df_tok_reset['nearest_phrases'].iloc[399])


['Spain is expelling the Libyan ambassador to Madrid and three other Libyan diplomats here, the Spanish Foreign Ministry said in a statement Thursday.', 'The government of Spain has decided to put an end to the mission of the ambassador accredited in Madrid by the authorities in Tripoli -LRB- Libya -RRB-, because the Gadhafi regime has lost all legitimacy due to its continual repression of the Libyan population, the statement said.', 'The Libyan ambassador to Madrid, Ajeli Abdussalam Ali Breni, has 10 days to leave Spain, the ministry said.', 'The government also is expelling three other diplomats at the Libyan embassy who undertook activities incompatible with their diplomatic status, the statement said.', 'But Spain has not broken diplomatic relations with Libya, said a Foreign Ministry spokeswoman, who by custom is not identified.', 'Spain still has an embassy in Tripoli, although it evacuated its personnel from the city, like many other nations, as fighting between the regime of Mo

In [20]:
# Convert all lists in the "nearest_phrases" column to single strings
filtered_df_tok_reset['nearest_phrases'] = filtered_df_tok_reset['nearest_phrases'].apply(lambda x: ' '.join(x))

In [21]:
# Keep only the 'nearest_phrases' and 'level' columns
filtered_df_selected_columns = filtered_df_tok_reset[['nearest_phrases', 'level']]

# Print the resulting DataFrame
print(filtered_df_selected_columns)

                                       nearest_phrases  level
0    Police and neighbors were searching for three ...      4
1    At least 26 people were killed Monday in the S...      4
2    To many who watched his long, rambling video s...      4
3    The night Bob Hawke lost the Labor leadership ...      4
4    ﻿Swedish prisons have long had a reputation ar...      4
..                                                 ...    ...
438  Light propagating in the vicinity of astrophys...      5
439  Future of dentistry has become one of the most...      5
440  ﻿The forests – and suburbs – of Europe are ech...      5
441  Hedge funds are turning bullish on oil once ag...      5
442  Without additional heating, radiative cooling ...      5

[443 rows x 2 columns]


In [22]:
# Reset the index to create a new default integer index
filtered_df_selected_columns_reset = filtered_df_selected_columns.reset_index(drop=True)

# Add a new 'id' column with unique identifiers starting from 0
filtered_df_selected_columns_reset['id'] = range(len(filtered_df_selected_columns_reset))

# Print the resulting DataFrame
print(filtered_df_selected_columns_reset)

                                       nearest_phrases  level   id
0    Police and neighbors were searching for three ...      4    0
1    At least 26 people were killed Monday in the S...      4    1
2    To many who watched his long, rambling video s...      4    2
3    The night Bob Hawke lost the Labor leadership ...      4    3
4    ﻿Swedish prisons have long had a reputation ar...      4    4
..                                                 ...    ...  ...
438  Light propagating in the vicinity of astrophys...      5  438
439  Future of dentistry has become one of the most...      5  439
440  ﻿The forests – and suburbs – of Europe are ech...      5  440
441  Hedge funds are turning bullish on oil once ag...      5  441
442  Without additional heating, radiative cooling ...      5  442

[443 rows x 3 columns]


In [23]:
# Move the 'id' column to the first position
filtered_df_selected_columns_reset = filtered_df_selected_columns_reset[['id'] + [col for col in filtered_df_selected_columns_reset.columns if col != 'id']]

# Print the resulting DataFrame
print(filtered_df_selected_columns_reset)

      id                                    nearest_phrases  level
0      0  Police and neighbors were searching for three ...      4
1      1  At least 26 people were killed Monday in the S...      4
2      2  To many who watched his long, rambling video s...      4
3      3  The night Bob Hawke lost the Labor leadership ...      4
4      4  ﻿Swedish prisons have long had a reputation ar...      4
..   ...                                                ...    ...
438  438  Light propagating in the vicinity of astrophys...      5
439  439  Future of dentistry has become one of the most...      5
440  440  ﻿The forests – and suburbs – of Europe are ech...      5
441  441  Hedge funds are turning bullish on oil once ag...      5
442  442  Without additional heating, radiative cooling ...      5

[443 rows x 3 columns]


In [24]:
# Rename the 'nearest_phrases' column to 'text'
filtered_df_selected_columns_reset.rename(columns={'nearest_phrases': 'text'}, inplace=True)

# Print the resulting DataFrame
print(filtered_df_selected_columns_reset)

      id                                               text  level
0      0  Police and neighbors were searching for three ...      4
1      1  At least 26 people were killed Monday in the S...      4
2      2  To many who watched his long, rambling video s...      4
3      3  The night Bob Hawke lost the Labor leadership ...      4
4      4  ﻿Swedish prisons have long had a reputation ar...      4
..   ...                                                ...    ...
438  438  Light propagating in the vicinity of astrophys...      5
439  439  Future of dentistry has become one of the most...      5
440  440  ﻿The forests – and suburbs – of Europe are ech...      5
441  441  Hedge funds are turning bullish on oil once ag...      5
442  442  Without additional heating, radiative cooling ...      5

[443 rows x 3 columns]


In [25]:
# Select only the desired columns 'Id', 'cleaned_text', and 'level'
selected_columns_test_df = test_df[['Id', 'cleaned_text']]

# Print the resulting DataFrame
print(selected_columns_test_df)

        Id                                       cleaned_text
0        0  Hi granny, Look at my photos about my holliday...
1        1  Hi, I'm twenty on Monday. I'm having a party i...
2        2  make 2 teams of 6 or more players one ground s...
3        3  Yes, I can recommend a green Amazon hotel. It'...
4        4  john discovers that Isabella doesn't stay with...
...    ...                                                ...
6810  6810  we can find difference between the two website...
6811  6811  I traveled to Africa last year. I have never l...
6812  6812  Hello Im writing to the website in order to be...
6813  6813  Hello My name is Jean-Charles I'm twenty three...
6814  6814                        I always go to small shops.

[6815 rows x 2 columns]


In [26]:
# Rename the 'cleaned_text' column to 'text'
selected_columns_test_df.rename(columns={'cleaned_text': 'text'}, inplace=True)

# Print the resulting DataFrame
print(selected_columns_test_df)

        Id                                               text
0        0  Hi granny, Look at my photos about my holliday...
1        1  Hi, I'm twenty on Monday. I'm having a party i...
2        2  make 2 teams of 6 or more players one ground s...
3        3  Yes, I can recommend a green Amazon hotel. It'...
4        4  john discovers that Isabella doesn't stay with...
...    ...                                                ...
6810  6810  we can find difference between the two website...
6811  6811  I traveled to Africa last year. I have never l...
6812  6812  Hello Im writing to the website in order to be...
6813  6813  Hello My name is Jean-Charles I'm twenty three...
6814  6814                        I always go to small shops.

[6815 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_columns_test_df.rename(columns={'cleaned_text': 'text'}, inplace=True)


In [27]:
# Select only the desired columns 'Id', 'cleaned_text', and 'level'
selected_columns_train_df = train_df[['Id', 'cleaned_text', 'level']]

# Print the resulting DataFrame
print(selected_columns_train_df)

          Id                                       cleaned_text  level
0          0  My friend Meg was going out to work like every...      3
1          1  Hello I'm sorry but i'm on holiday when you ca...      0
2          2  Name : MaryAge : 62Hair color : blondeEyes: : ...      0
3          3  hi , my name is laila , i live in paris from f...      0
4          4  A pink sweater : 10 $.A black dress with a whi...      0
...      ...                                                ...    ...
20440  20440  Hi mom and pop!I'm going to the pop festival t...      1
20441  20441  Hello, I don't know if the things telling abou...      2
20442  20442  I'm Ericka. I live in Paris .I get up at 7 in ...      0
20443  20443  Hi, Jessy! You're on vacation, It's too bad to...      0
20444  20444  Hi, my name is Gregory. I'm thirdy-nine. My fa...      0

[20445 rows x 3 columns]


In [28]:
# Rename the 'cleaned_text' column to 'text'
selected_columns_train_df.rename(columns={'cleaned_text': 'text'}, inplace=True)

# Print the resulting DataFrame
print(selected_columns_train_df)

          Id                                               text  level
0          0  My friend Meg was going out to work like every...      3
1          1  Hello I'm sorry but i'm on holiday when you ca...      0
2          2  Name : MaryAge : 62Hair color : blondeEyes: : ...      0
3          3  hi , my name is laila , i live in paris from f...      0
4          4  A pink sweater : 10 $.A black dress with a whi...      0
...      ...                                                ...    ...
20440  20440  Hi mom and pop!I'm going to the pop festival t...      1
20441  20441  Hello, I don't know if the things telling abou...      2
20442  20442  I'm Ericka. I live in Paris .I get up at 7 in ...      0
20443  20443  Hi, Jessy! You're on vacation, It's too bad to...      0
20444  20444  Hi, my name is Gregory. I'm thirdy-nine. My fa...      0

[20445 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_columns_train_df.rename(columns={'cleaned_text': 'text'}, inplace=True)


In [29]:
# Save filtered_df_selected_columns_reset to cleaned_data_augment.txt
filtered_df_selected_columns_reset.to_csv("cleaned_data_augment.txt", sep='\t', index=False)

# Save selected_columns_train_df to cleaned_train_data.csv
selected_columns_train_df.to_csv("cleaned_train_data.csv", index=False)

# Save selected_columns_test_df to cleaned_test_data.csv
selected_columns_test_df.to_csv("cleaned_test_data.csv", index=False)

In [30]:
import numpy as np

In [31]:
# Load the cleaned dataframes
selected_columns_train_df = pd.read_csv("cleaned_train_data.csv", encoding="utf-8")
filtered_df_selected_columns_reset = pd.read_csv("cleaned_data_augment.txt", encoding="utf-8", sep="\t")

# Function to insert row at random index in dataframe
def insert_row_at_random_index(dataframe, row):
    random_index = np.random.randint(0, len(dataframe) + 1)
    return pd.concat([dataframe.iloc[:random_index], pd.DataFrame([row]), dataframe.iloc[random_index:]], ignore_index=True)

# Iterate over rows in filtered_df_selected_columns_reset and insert at random index in selected_columns_train_df
for _, row in filtered_df_selected_columns_reset.iterrows():
    selected_columns_train_df = insert_row_at_random_index(selected_columns_train_df, row)

# Save the concatenated dataframe to cleaned_final_train_data.csv
selected_columns_train_df.to_csv("cleaned_final_train_data.csv", index=False)

In [32]:
selected_columns_train_df.value_counts

<bound method DataFrame.value_counts of             Id                                               text  level  id
0          0.0  My friend Meg was going out to work like every...      3 NaN
1          1.0  Hello I'm sorry but i'm on holiday when you ca...      0 NaN
2          2.0  Name : MaryAge : 62Hair color : blondeEyes: : ...      0 NaN
3          3.0  hi , my name is laila , i live in paris from f...      0 NaN
4          4.0  A pink sweater : 10 $.A black dress with a whi...      0 NaN
...        ...                                                ...    ...  ..
20883  20440.0  Hi mom and pop!I'm going to the pop festival t...      1 NaN
20884  20441.0  Hello, I don't know if the things telling abou...      2 NaN
20885  20442.0  I'm Ericka. I live in Paris .I get up at 7 in ...      0 NaN
20886  20443.0  Hi, Jessy! You're on vacation, It's too bad to...      0 NaN
20887  20444.0  Hi, my name is Gregory. I'm thirdy-nine. My fa...      0 NaN

[20888 rows x 4 columns]>

In [33]:
filtered_df_selected_columns_reset.value_counts

<bound method DataFrame.value_counts of       id                                               text  level
0      0  Police and neighbors were searching for three ...      4
1      1  At least 26 people were killed Monday in the S...      4
2      2  To many who watched his long, rambling video s...      4
3      3  The night Bob Hawke lost the Labor leadership ...      4
4      4  ﻿Swedish prisons have long had a reputation ar...      4
..   ...                                                ...    ...
438  438  Light propagating in the vicinity of astrophys...      5
439  439  Future of dentistry has become one of the most...      5
440  440  ﻿The forests – and suburbs – of Europe are ech...      5
441  441  Hedge funds are turning bullish on oil once ag...      5
442  442  Without additional heating, radiative cooling ...      5

[443 rows x 3 columns]>

In [34]:
train_df

Unnamed: 0,Id,text,level,cleaned_text
0,0,\r\r\n My friend Meg was going out to wor...,3,My friend Meg was going out to work like every...
1,1,\r\r\n Hello I'm sorry but i'm on holiday...,0,Hello I'm sorry but i'm on holiday when you ca...
2,2,\r\r\n Name : MaryAge : 62Hair color : bl...,0,Name : MaryAge : 62Hair color : blondeEyes: : ...
3,3,"\r\r\n hi , my name is laila , i live in ...",0,"hi , my name is laila , i live in paris from f..."
4,4,\r\r\n A pink sweater : 10 $.A black dres...,0,A pink sweater : 10 $.A black dress with a whi...
...,...,...,...,...
20440,20440,\r\r\n Hi mom and pop!I'm going to the po...,1,Hi mom and pop!I'm going to the pop festival t...
20441,20441,"\r\r\n Hello, I don't know if the things ...",2,"Hello, I don't know if the things telling abou..."
20442,20442,\r\r\n I'm Ericka. I live in Paris .I get...,0,I'm Ericka. I live in Paris .I get up at 7 in ...
20443,20443,"\r\r\n Hi, Jessy! You're on vacation, It'...",0,"Hi, Jessy! You're on vacation, It's too bad to..."


In [35]:
finale_test = pd.read_csv("cleaned_final_train_data.csv", encoding="utf-8")

In [36]:
finale_test

Unnamed: 0,Id,text,level,id
0,0.0,My friend Meg was going out to work like every...,3,
1,1.0,Hello I'm sorry but i'm on holiday when you ca...,0,
2,2.0,Name : MaryAge : 62Hair color : blondeEyes: : ...,0,
3,3.0,"hi , my name is laila , i live in paris from f...",0,
4,4.0,A pink sweater : 10 $.A black dress with a whi...,0,
...,...,...,...,...
20883,20440.0,Hi mom and pop!I'm going to the pop festival t...,1,
20884,20441.0,"Hello, I don't know if the things telling abou...",2,
20885,20442.0,I'm Ericka. I live in Paris .I get up at 7 in ...,0,
20886,20443.0,"Hi, Jessy! You're on vacation, It's too bad to...",0,
