# Zsoft Challenge

In [2]:
#!pip install langdetect

In [3]:
import pandas as pd
import numpy as np

import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Load the training data
df = pd.read_csv("Train_zsoft.csv")

In [None]:
# Check the number of samples and features
print(f"Number of samples: {df.shape[0]}")
print(f"Number of features: {df.shape[1]}")

Number of samples: 30129
Number of features: 3


In [None]:
df

Unnamed: 0,ID,text,label
0,17428,TRANSPORT SYTEX / OEDA,Experience
1,15362,Appliquer la résine Avancé,Skills
2,37153,2020 - 2021 ¢ Formation en ligne sur le,Education
3,10240,Score: 820 (B2),Skills
4,29713,- Anglais :courant - Créole :courant,Skills
...,...,...,...
30124,16850,du Groupe,Experience
30125,6265,@ Mise en place du marché et démarchage d'arti...,Experience
30126,11284,Baccalauréat général série ES 2019,Education
30127,860,Communication en ligne,Experience


In [None]:
# Check the distribution of labels
print(df["label"].value_counts())

Experience    16352
Skills         7691
Education      6086
Name: label, dtype: int64


In [None]:
pattern = r'[^a-zA-Z0-9À-ÿ\s]'
nltk.download('stopwords')

def remove_special_chars(text):
    text = text.lower()
    text = re.sub(pattern, '', text)
    return text

df['text'] = df['text'].apply(remove_special_chars)

def remove_stopwords(text):
    # split text into tokens
    tokens = nltk.word_tokenize(text)
    # remove stop words
    filtered_tokens = [token for token in tokens if token.lower() not in stopwords.words('english')]
    filtered_tokens = [token for token in filtered_tokens if token.lower() not in stopwords.words('french')]
    # join filtered tokens
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

# apply remove_stopwords function on 'text' column
df['text_new'] = df['text'].apply(remove_special_chars)

df['text_new'] = df['text'].apply(remove_stopwords)

# print the dataframe
print(df)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\benz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


          ID                                              text       label  \
0      17428                             transport sytex  oeda  Experience   
1      15362                        appliquer la résine avancé      Skills   
2      37153             2020  2021  formation en ligne sur le   Education   
3      10240                                      score 820 b2      Skills   
4      29713                   anglais courant  créole courant      Skills   
...      ...                                               ...         ...   
30124  16850                                         du groupe  Experience   
30125   6265   mise en place du marché et démarchage dartisans  Experience   
30126  11284                baccalauréat général série es 2019   Education   
30127    860                            communication en ligne  Experience   
30128  15795                    2004 agent qualifié de service  Experience   

                                     text_new  
0              

In [None]:
pip install spacy






[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [None]:
!python -m spacy download fr_core_news_sm


Collecting fr-core-news-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.5.0/fr_core_news_sm-3.5.0-py3-none-any.whl (16.3 MB)
     -------------------------------------- 16.3/16.3 MB 333.6 kB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')



[notice] A new release of pip is available: 23.0 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import spacy
# Load the French language model in Spacy
nlp = spacy.load('fr_core_news_sm')

# Define a function to lemmatize a sentence using Spacy
def lemmatize_sentence(sentence):
    # Lemmatize the sentence using Spacy
    doc = nlp(sentence)
    lemmas = []
    for token in doc:
        if token.pos_ == 'VERB':
            lemmas.append(token.lemma_)
        elif token.pos_ == 'ADV':
            lemmas.append(token.lemma_ + 'ment')
        elif token.pos_ == 'NOUN':
            lemmas.append(token.lemma_)
        else:
            lemmas.append(token.text)
    return ' '.join(lemmas)

# Load the French text data frame

# Apply lemmatization on the 'text' column of the data frame
df['lemmatized_text'] = df['text_new'].apply(lemmatize_sentence)

In [None]:
df

Unnamed: 0,ID,text,label,text_new,lemmatized_text
0,17428,transport sytex oeda,Experience,transport sytex oeda,transport sytex oeda
1,15362,appliquer la résine avancé,Skills,appliquer résine avancé,appliquer résine avancer
2,37153,2020 2021 formation en ligne sur le,Education,2020 2021 formation ligne,2020 2021 formation ligne
3,10240,score 820 b2,Skills,score 820 b2,score 820 b2
4,29713,anglais courant créole courant,Skills,anglais courant créole courant,anglais courant créole courant
...,...,...,...,...,...
30124,16850,du groupe,Experience,groupe,groupe
30125,6265,mise en place du marché et démarchage dartisans,Experience,mise place marché démarchage dartisans,mise place marché démarchage dartisans
30126,11284,baccalauréat général série es 2019,Education,baccalauréat général série 2019,baccalauréat général série 2019
30127,860,communication en ligne,Experience,communication ligne,communication ligne


In [None]:
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Apply the function to the 'text' column using apply()
df['lemmatized_text'] = df['lemmatized_text'].apply(remove_numbers)
df['lemmatized_text'] = df['lemmatized_text'].apply(remove_numbers)
df['lemmatized_text'] = df['text_new'].apply(lemmatize_sentence)# Print the result


In [None]:
df

Unnamed: 0,ID,text,label,text_new,lemmatized_text
0,17428,transport sytex oeda,Experience,transport sytex oeda,transport sytex oeda
1,15362,appliquer la résine avancé,Skills,appliquer résine avancé,appliquer résine avancer
2,37153,2020 2021 formation en ligne sur le,Education,2020 2021 formation ligne,2020 2021 formation ligne
3,10240,score 820 b2,Skills,score 820 b2,score 820 b2
4,29713,anglais courant créole courant,Skills,anglais courant créole courant,anglais courant créole courant
...,...,...,...,...,...
30124,16850,du groupe,Experience,groupe,groupe
30125,6265,mise en place du marché et démarchage dartisans,Experience,mise place marché démarchage dartisans,mise place marché démarchage dartisans
30126,11284,baccalauréat général série es 2019,Education,baccalauréat général série 2019,baccalauréat général série 2019
30127,860,communication en ligne,Experience,communication ligne,communication ligne


In [None]:
# Create an instance of TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

# Fit the vectorizer on the text data
tfidf.fit(df["lemmatized_text"])

# Transform the text data into a matrix of TF-IDF features
features = tfidf.transform(df['lemmatized_text'])

# Convert the features matrix to a DataFrame
features_df = pd.DataFrame(features.toarray(), columns=tfidf.get_feature_names())

# Concatenate the features DataFrame with the original DataFrame
result_df = pd.concat([df, features_df], axis=1)

# Display the result DataFrame
print(result_df)



          ID                                              text       label  \
0      17428                             transport sytex  oeda  Experience   
1      15362                        appliquer la résine avancé      Skills   
2      37153             2020  2021  formation en ligne sur le   Education   
3      10240                                      score 820 b2      Skills   
4      29713                   anglais courant  créole courant      Skills   
...      ...                                               ...         ...   
30124  16850                                         du groupe  Experience   
30125   6265   mise en place du marché et démarchage dartisans  Experience   
30126  11284                baccalauréat général série es 2019   Education   
30127    860                            communication en ligne  Experience   
30128  15795                    2004 agent qualifié de service  Experience   

                                     text_new  \
0             

In [None]:
result_df= result_df.drop(["ID",	"text",	"label",	"text_new",	"lemmatized_text"], axis=1)


In [None]:
result_df


Unnamed: 0,00,000,0000,00000,000000,0000000,0000008000,000k,000m2,0027,...,évéenementiel,évéenements,événement,événementiel,événementielle,événementielles,événementiels,événementment,événements,éxister
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
result_df

Unnamed: 0,00,000,0000,00000,000000,0000000,0000008000,000k,000m2,0027,...,évéenementiel,évéenements,événement,événementiel,événementielle,événementielles,événementiels,événementment,événements,éxister
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30126,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_features = pca.fit_transform(result_df)

# Create a new DataFrame with the PCA features
pca_df = pd.DataFrame(pca_features, columns=['PC{}'.format(i+1) for i in range(2)])

# Concatenate the PCA DataFrame with the original DataFrame
result_df = pd.concat([df, pca_df], axis=1)

# Display the result DataFrame
print(result_df)

NameError: name 'result_df' is not defined

In [None]:
result_df

Unnamed: 0,enc_label,PC1,PC2
0,0.0,-0.932184,-0.016427
1,1.0,0.482225,-0.016090
2,2.0,1.897358,-0.002097
3,1.0,0.481796,-0.013288
4,1.0,0.482971,-0.043447
...,...,...,...
30124,0.0,-0.932936,-0.021335
30125,0.0,-0.933399,-0.015040
30126,2.0,1.898818,-0.009532
30127,0.0,-0.931417,-0.003708


In [None]:
from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(categories=[['Experience', 'Skills', 'Education']])
encoder.fit(df[['label']])
df['enc_label'] = encoder.transform(df[['label']])



In [None]:
result_df['enc_label'] = encoder.transform(df[['label']])

In [None]:
df

Unnamed: 0,ID,text,label,text_new,lemmatized_text,enc_label
0,17428,transport sytex oeda,Experience,transport sytex oeda,transport sytex oeder,0.0
1,15362,appliquer la résine avancé,Skills,appliquer résine avancé,appliquer résine avancer,1.0
2,37153,2020 2021 formation en ligne sur le,Education,2020 2021 formation ligne,formation ligne,2.0
3,10240,score 820 b2,Skills,score 820 b2,score b,1.0
4,29713,anglais courant créole courant,Skills,anglais courant créole courant,anglais courant créole courant,1.0
...,...,...,...,...,...,...
30124,16850,du groupe,Experience,groupe,groupe,0.0
30125,6265,mise en place du marché et démarchage dartisans,Experience,mise place marché démarchage dartisans,mise place marcher démarchage dartisans,0.0
30126,11284,baccalauréat général série es 2019,Education,baccalauréat général série 2019,baccalauréat général série,2.0
30127,860,communication en ligne,Experience,communication ligne,communication ligne,0.0


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
gnb = GaussianNB()
shuffled_df =result_df.sample(frac=1).reset_index(drop=True)
df5=shuffled_df.drop[shuffled_df['enc_label']]
train_texts, val_texts, train_labels, val_labels = train_test_split(df5, result_df['enc_label'], test_size=0.2)





TypeError: 'method' object is not subscriptable

In [None]:
shuffled_df

Unnamed: 0,enc_label,PC1,PC2
0,1.0,-0.048972,-0.001225
1,0.0,-0.006501,-0.008425
2,1.0,-0.015434,-0.006151
3,1.0,-0.014400,-0.005539
4,1.0,-0.015584,-0.005921
...,...,...,...
30124,0.0,-0.014261,-0.006505
30125,2.0,-0.009905,-0.009449
30126,1.0,-0.014260,-0.005440
30127,0.0,-0.015552,-0.007774


In [None]:

# Train the classifier on the training data
gnb.fit(train_texts, train_labels )

# Use the trained classifier to make predictions on the test data
y_pred = gnb.predict( val_texts)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(val_labels, y_pred)
print(f"Accuracy: {accuracy}")


Accuracy: 1.0


In [None]:
testing = pd.read_csv('/kaggle/input/test-zsoft1csv/Test_zsoft(1).csv')


In [None]:
testing['text_new'] =testing['text'].apply(remove_special_chars)

testing['text_new'] = testing['text'].apply(remove_stopwords)

In [None]:
testing['lemmatized_text'] = testing['text_new'].apply(lemmatize_sentence)
testing['lemmatized_text'] = testing['lemmatized_text'].apply(remove_numbers)


In [None]:
testing

Unnamed: 0,ID,text,text_new,lemmatized_text
0,31507,e Coordinating and distributing shooting sched...,e Coordinating distributing shooting schedules,e Coordinating distributing shootingment sched...
1,12238,Dipl6me Universitaire en Production de Contenu...,Dipl6me Universitaire Production Contenu Multi...,Diplme Universitaire production contenu Multim...
2,37232,Professeur Patrick Cohen.,Professeur Patrick Cohen .,Professeur Patrick Cohen .
3,22248,"e Autonomie, sens de la responsabilité","e Autonomie , sens responsabilité","e autonomie , sens responsabiliter"
4,4589,2021-2023 2020-2021 2017-2019,2021-2023 2020-2021 2017-2019,- - -
...,...,...,...,...
7528,27317,Déc 13 /jan 14 Assistante de vie Résidence Moi...,Déc 13 /jan 14 Assistante vie Résidence Moise ...,Déc /jan assistant vie résidence moise Léon ...
7529,13339,"01/09/2011 Chargé d'accueil clientéle, remised...","01/09/2011 Chargé d'accueil clientéle , remise...","// charger d' accueil clientéle , remisedechéq..."
7530,7020,2022/2023 Bachelor Ressources,2022/2023 Bachelor Ressources,/ Bachelor Ressources
7531,30136,Centre de Loisirs dans les Ecoles Maternelles ...,Centre Loisirs Ecoles Maternelles - L'Hay-les-...,centre Loisirs Ecoles maternelle - L' Hay-les-...


In [None]:
tfidf = TfidfVectorizer()

# Fit the vectorizer on the text data
tfidf.fit(testing["lemmatized_text"])

# Transform the text data into a matrix of TF-IDF features
features = tfidf.transform(testing['lemmatized_text'])

# Convert the features matrix to a DataFrame
features_df = pd.DataFrame(features.toarray(), columns=tfidf.get_feature_names())

# Concatenate the features DataFrame with the original DataFrame
result = pd.concat([testing, features_df], axis=1)

# Display the result DataFrame




In [None]:
result

Unnamed: 0,ID,text,text_new,lemmatized_text,_a,aa,aaaaaaa,aabccd,aalborg,ab,...,éviter,évoluer,évolution,évéenementiel,événement,événementiel,événementielle,événementiels,événements,événemer
0,31507,e Coordinating and distributing shooting sched...,e Coordinating distributing shooting schedules,e Coordinating distributing shootingment sched...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12238,Dipl6me Universitaire en Production de Contenu...,Dipl6me Universitaire Production Contenu Multi...,Diplme Universitaire production contenu Multim...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,37232,Professeur Patrick Cohen.,Professeur Patrick Cohen .,Professeur Patrick Cohen .,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,22248,"e Autonomie, sens de la responsabilité","e Autonomie , sens responsabilité","e autonomie , sens responsabiliter",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4589,2021-2023 2020-2021 2017-2019,2021-2023 2020-2021 2017-2019,- - -,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7528,27317,Déc 13 /jan 14 Assistante de vie Résidence Moi...,Déc 13 /jan 14 Assistante vie Résidence Moise ...,Déc /jan assistant vie résidence moise Léon ...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7529,13339,"01/09/2011 Chargé d'accueil clientéle, remised...","01/09/2011 Chargé d'accueil clientéle , remise...","// charger d' accueil clientéle , remisedechéq...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7530,7020,2022/2023 Bachelor Ressources,2022/2023 Bachelor Ressources,/ Bachelor Ressources,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7531,30136,Centre de Loisirs dans les Ecoles Maternelles ...,Centre Loisirs Ecoles Maternelles - L'Hay-les-...,centre Loisirs Ecoles maternelle - L' Hay-les-...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
result= result.drop(["ID",	"text",	"text_new",	"lemmatized_text"], axis=1)


In [None]:
result

Unnamed: 0,_a,aa,aaaaaaa,aabccd,aalborg,ab,abandon,abaqus,abbeville,abbi,...,événement,événementiel,événementielle,événementiels,événements,événemer,PC1,PC2,PC1.1,PC2.1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.008027,-0.017583,-0.011352,-0.024857
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.013159,-0.012569,-0.018608,-0.017740
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.007964,-0.017464,-0.011262,-0.024691
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.010363,-0.020707,-0.014657,-0.029244
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.007637,-0.016526,-0.010800,-0.023364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.012368,-0.026828,-0.017488,-0.038073
7529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.013563,-0.016839,-0.019194,-0.023746
7530,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.011310,-0.015946,-0.015995,-0.022563
7531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,-0.007143,-0.018630,-0.010100,-0.026342


In [None]:

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca_features = pca.fit_transform(result)

# Create a new DataFrame with the PCA features
pca_df2 = pd.DataFrame(pca_features, columns=['PC{}'.format(i+1) for i in range(2)])

# Concatenate the PCA DataFrame with the original DataFrame
result= pd.concat([testing, pca_df2], axis=1)

# Display the result DataFrame


In [None]:
result

Unnamed: 0,ID,text,text_new,lemmatized_text,PC1,PC2
0,31507,e Coordinating and distributing shooting sched...,e Coordinating distributing shooting schedules,e Coordinating distributing shootingment sched...,-0.022704,-0.049715
1,12238,Dipl6me Universitaire en Production de Contenu...,Dipl6me Universitaire Production Contenu Multi...,Diplme Universitaire production contenu Multim...,-0.037215,-0.035480
2,37232,Professeur Patrick Cohen.,Professeur Patrick Cohen .,Professeur Patrick Cohen .,-0.022523,-0.049382
3,22248,"e Autonomie, sens de la responsabilité","e Autonomie , sens responsabilité","e autonomie , sens responsabiliter",-0.029313,-0.058489
4,4589,2021-2023 2020-2021 2017-2019,2021-2023 2020-2021 2017-2019,- - -,-0.021599,-0.046729
...,...,...,...,...,...,...
7528,27317,Déc 13 /jan 14 Assistante de vie Résidence Moi...,Déc 13 /jan 14 Assistante vie Résidence Moise ...,Déc /jan assistant vie résidence moise Léon ...,-0.034976,-0.076146
7529,13339,"01/09/2011 Chargé d'accueil clientéle, remised...","01/09/2011 Chargé d'accueil clientéle , remise...","// charger d' accueil clientéle , remisedechéq...",-0.038388,-0.047490
7530,7020,2022/2023 Bachelor Ressources,2022/2023 Bachelor Ressources,/ Bachelor Ressources,-0.031990,-0.045126
7531,30136,Centre de Loisirs dans les Ecoles Maternelles ...,Centre Loisirs Ecoles Maternelles - L'Hay-les-...,centre Loisirs Ecoles maternelle - L' Hay-les-...,-0.020201,-0.052685


In [None]:
result= result.drop(["ID",	"text",	"text_new",	"lemmatized_text"], axis=1)
result

Unnamed: 0,PC1,PC2
0,-0.022704,-0.049715
1,-0.037215,-0.035480
2,-0.022523,-0.049382
3,-0.029313,-0.058489
4,-0.021599,-0.046729
...,...,...
7528,-0.034976,-0.076146
7529,-0.038388,-0.047490
7530,-0.031990,-0.045126
7531,-0.020201,-0.052685


In [None]:
y_pred = gnb.predict( result)


Feature names seen at fit time, yet now missing:
- enc_label



ValueError: X has 2 features, but GaussianNB is expecting 3 features as input.