In [1]:
import pandas as pd
from collections import Counter
from sklearn import svm
import re
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report



def charge_donnees():
    """Load the train, dev and test tweet files into module-level DataFrames.

    Each file is read as tab-separated text and its columns are named
    ``tweet_id``, ``sentiment`` and ``text``. The results are published through
    the globals ``train_data``, ``dev_data`` and ``test_data``; nothing is
    returned.
    """
    global train_data, dev_data, test_data
    column_names = ['tweet_id', 'sentiment', 'text']

    def read_split(path):
        # Step 1: one tab-separated file -> one DataFrame.
        return pd.read_csv(path, sep='\t', names=column_names)

    # Bare file names: resolved relative to the current working directory.
    train_data = read_split('twitter-2013train-A.txt')
    dev_data = read_split('twitter-2013dev-A.txt')
    test_data = read_split('twitter-2013test-A.txt')
# Preprocessing function
def preprocess_text(text):
    """Normalise a message before it is split into words.

    The text is lower-cased, every non-word character is replaced by a space,
    and each run of whitespace is then collapsed into a single space. Leading
    and trailing spaces are not stripped.
    """
    lowered = text.lower()
    without_symbols = re.sub(r'\W', ' ', lowered)
    return re.sub(r'\s+', ' ', without_symbols)

def etape23():
    """Build the training lexicon and number its words (steps 2 and 3).

    Reads the module-level ``train_data`` and collects every distinct token of
    its ``'text'`` entries after ``preprocess_text`` and whitespace splitting.

    Returns:
        dict mapping each word to a unique integer index starting at 1.
        Indices follow the sorted order of the words, so the numbering is the
        same on every run.
    """
    lexique = set()
    for text in train_data['text']:
        lexique.update(preprocess_text(text).split())
    # Step 3: iterating a set of strings directly yields an order that can
    # change from one interpreter session to the next (string hash
    # randomisation), which would renumber the features after every kernel
    # restart. Sorting pins the numbering.
    return {word: idx for idx, word in enumerate(sorted(lexique), start=1)}

# Count word occurrences
def etape4(train_data, word_to_index):
    """Count, per message, how often each known word occurs (step 4).

    Args:
        train_data: table-like object whose ``'text'`` entry yields the raw
            messages (despite the name, any split can be passed).
        word_to_index: dict mapping a word to its integer index.

    Returns:
        list with one ``Counter`` per message, keyed by word index. Words
        that are missing from ``word_to_index`` are ignored.
    """
    counts_by_message = []
    for raw_text in train_data['text']:
        counter = Counter()
        for token in preprocess_text(raw_text).split():
            if token in word_to_index:
                counter[word_to_index[token]] += 1
        counts_by_message.append(counter)
    return counts_by_message



# Load the three splits into the globals train_data / dev_data / test_data.
charge_donnees()
# Steps 2-3: build the word -> index lexicon from the training texts.
word_to_index = etape23()
# Step 4: one Counter of word-index occurrences per training message.
word_counts_per_message = etape4(train_data,word_to_index)

def convert_to_svm_format(data, word_counts):
    """Render each row as a text line ``"<label> <index>:<count> ..."``.

    Args:
        data: DataFrame-like object; rows are read with ``itertuples()`` and
            must expose a ``sentiment`` attribute equal to ``'positive'``,
            ``'negative'`` or ``'neutral'``.
        word_counts: sequence aligned with the rows of ``data``; element ``i``
            maps feature index to count for row ``i``.

    Returns:
        list of strings, one per row, with features in ascending index order.

    Raises:
        KeyError: if a row's sentiment is not one of the three known labels.
    """
    label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}

    def render(position, row):
        label = label_mapping[row.sentiment]
        pairs = list(word_counts[position].items())
        pairs.sort()  # ascending feature index
        feature_str = ' '.join(f"{index}:{count}" for index, count in pairs)
        return f"{label} {feature_str}"

    return [render(position, row) for position, row in enumerate(data.itertuples())]



# Text rendering of the training set, one "<label> <index>:<count> ..." line per tweet.
svm_train_data = convert_to_svm_format(train_data, word_counts_per_message)

# Integer class assigned to each sentiment string.
label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}


def _build_feature_matrix(word_counts, n_rows, n_features):
    """Return an ``n_rows`` x ``n_features`` dense count matrix (list of lists).

    ``word_counts[i]`` maps 1-based lexicon indices to counts for row ``i``;
    each index is shifted by -1 to address the 0-based column.
    """
    # NOTE(review): a dense list of lists grows as rows x vocabulary size; a
    # scipy.sparse matrix would be much smaller if memory becomes a problem.
    matrix = [[0] * n_features for _ in range(n_rows)]
    for row_number, sample in enumerate(word_counts):
        for index, count in sample.items():
            matrix[row_number][index - 1] = count
    return matrix


def _labels(data):
    """Map the sentiment string of every row of ``data`` to its integer class."""
    return [label_mapping[row.sentiment] for row in data.itertuples()]


def _evaluate_split(data, report_title):
    """Featurise ``data`` with the training lexicon, predict, print a report.

    Returns the intermediate objects (word counts, SVM-format lines, feature
    matrix, gold labels, predictions) so they stay available in the notebook.
    """
    word_counts = etape4(data, word_to_index)
    svm_lines = convert_to_svm_format(data, word_counts)
    features = _build_feature_matrix(word_counts, len(data), len(word_to_index))
    gold = _labels(data)
    predicted = clf.predict(features)
    print(report_title)
    print(classification_report(gold, predicted))
    return word_counts, svm_lines, features, gold, predicted


# Training the SVM
clf = LinearSVC(dual=False)
X_train = _build_feature_matrix(word_counts_per_message, len(train_data), len(word_to_index))
y_train = _labels(train_data)
clf.fit(X_train, y_train)

# Evaluate the model on development data
word_counts_dev, svm_dev_data, X_dev, y_dev, y_pred_dev = _evaluate_split(
    dev_data, "Classification Report for Development Set:"
)

# Evaluate the model on test data
word_counts_test, svm_test_data, X_test, y_test, y_pred_test = _evaluate_split(
    test_data, "Classification Report for Test Set:"
)

# RESULTS

# Classification Report for Development Set:
#               precision    recall  f1-score   support
#
#           -1       0.58      0.41      0.48       340
#            0       0.64      0.69      0.67       739
#            1       0.62      0.66      0.64       575
#
#     accuracy                           0.62      1654
#    macro avg       0.61      0.59      0.59      1654
# weighted avg       0.62      0.62      0.62      1654
#
# Classification Report for Test Set:
#               precision    recall  f1-score   support
#
#           -1       0.57      0.37      0.45       559
#            0       0.61      0.76      0.68      1513
#            1       0.70      0.62      0.66      1475
#
#     accuracy                           0.64      3547
#    macro avg       0.63      0.58      0.60      3547
# weighted avg       0.64      0.64      0.64      3547


Classification Report for Development Set:
              precision    recall  f1-score   support

          -1       0.58      0.41      0.48       340
           0       0.64      0.69      0.67       739
           1       0.62      0.66      0.64       575

    accuracy                           0.62      1654
   macro avg       0.61      0.59      0.59      1654
weighted avg       0.62      0.62      0.62      1654

Classification Report for Test Set:
              precision    recall  f1-score   support

          -1       0.57      0.37      0.45       559
           0       0.61      0.76      0.68      1513
           1       0.70      0.62      0.66      1475

    accuracy                           0.64      3547
   macro avg       0.63      0.58      0.60      3547
weighted avg       0.64      0.64      0.64      3547



In [1]:
import pandas as pd
from collections import Counter
from sklearn import svm
import re
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Cellule 1 : Chargement des données


In [2]:
def charge_donnees():
    """Read the three tweet files and expose them as global DataFrames.

    Sets ``train_data``, ``dev_data`` and ``test_data``. Every file is parsed
    as tab-separated text with the columns ``tweet_id``, ``sentiment`` and
    ``text``. Returns nothing.
    """
    global train_data, dev_data, test_data
    # Step 1: identical parsing options for every split.
    tsv_options = {'sep': '\t', 'names': ['tweet_id', 'sentiment', 'text']}
    train_data = pd.read_csv('twitter-2013train-A.txt', **tsv_options)
    dev_data = pd.read_csv('twitter-2013dev-A.txt', **tsv_options)
    test_data = pd.read_csv('twitter-2013test-A.txt', **tsv_options)


# Cellule 2 : Fonction de prétraitement du texte


In [3]:
def preprocess_text(text):
    """Lower-case ``text`` and reduce it to space-separated word characters.

    Two substitutions are applied in order: each non-word character (not just
    non-alphabetic ones: digits and underscores are kept) becomes a space,
    then each run of whitespace becomes a single space. The result is not
    stripped.
    """
    cleaned = text.lower()
    for pattern in (r'\W', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned


# Cellule 3 : Extraction du lexique


In [4]:
def etape23():
    """Extract the lexicon of the training texts and index it (steps 2 and 3).

    Reads the module-level ``train_data``; each ``'text'`` entry is normalised
    with ``preprocess_text`` and split on whitespace.

    Returns:
        dict mapping every distinct word to a unique integer, starting at 1
        and assigned in sorted word order so that the mapping is reproducible.
    """
    lexique = {
        word
        for text in train_data['text']
        for word in preprocess_text(text).split()
    }
    # Step 3: a set of strings has no stable iteration order across
    # interpreter sessions (string hash randomisation), so enumerate a sorted
    # copy to keep feature numbers identical after a kernel restart.
    return {word: number for number, word in enumerate(sorted(lexique), start=1)}


# Cellule 4 : Comptage des occurrences des mots


In [5]:
def etape4(train_data, word_to_index):
    """Turn every message into a ``Counter`` of lexicon indices (step 4).

    Args:
        train_data: table-like object whose ``'text'`` entry yields the raw
            messages.
        word_to_index: dict mapping a word to its integer index.

    Returns:
        list of ``Counter`` objects, one per message, in input order. Words
        absent from ``word_to_index`` do not contribute.
    """
    def count_indices(raw_text):
        tokens = preprocess_text(raw_text).split()
        return Counter(word_to_index[token] for token in tokens if token in word_to_index)

    return [count_indices(raw_text) for raw_text in train_data['text']]


# Cellule 5 : Conversion au format SVM


In [6]:
def convert_to_svm_format(data, word_counts):
    """Build one ``"<label> <index>:<count> ..."`` string per row of ``data``.

    Args:
        data: DataFrame-like object iterated with ``itertuples()``; each row
            needs a ``sentiment`` attribute holding ``'positive'``,
            ``'negative'`` or ``'neutral'``.
        word_counts: sequence where element ``i`` maps feature index to count
            for row ``i`` of ``data``.

    Returns:
        list of strings in row order; features are listed by ascending index.

    Raises:
        KeyError: when a sentiment value is not one of the three known labels.
    """
    label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}
    lines = []
    for position, row in enumerate(data.itertuples()):
        label = label_mapping[row.sentiment]
        # Sorting the (index, count) pairs orders the features by index.
        ordered_pairs = sorted(word_counts[position].items())
        feature_str = ' '.join(f"{index}:{count}" for index, count in ordered_pairs)
        lines.append(f"{label} {feature_str}")
    return lines


# Cellule 6 : Conversion des données d'apprentissage au format SVM


In [7]:
# Load the three splits into the globals train_data / dev_data / test_data.
charge_donnees()
# Steps 2-3: build the word -> index lexicon from the training texts.
word_to_index = etape23()
# Step 4: one Counter of word-index occurrences per training message.
word_counts_per_message = etape4(train_data, word_to_index)
# Text rendering of the training set, one "<label> <index>:<count> ..." line per tweet.
svm_train_data = convert_to_svm_format(train_data, word_counts_per_message)


# Cellule 7 : Entraînement du SVM


In [8]:
# Integer class assigned to each sentiment string.
label_mapping = {'positive': 1, 'negative': -1, 'neutral': 0}
y_train = [label_mapping[sentiment] for sentiment in train_data['sentiment']]

# Dense count matrix: one row per training message, one column per lexicon word.
X_train = [[0] * len(word_to_index) for _ in range(len(train_data))]
for i, sample in enumerate(word_counts_per_message):
    for index, count in sample.items():
        # Lexicon indices start at 1 while list columns start at 0.
        X_train[i][index - 1] = count

clf = LinearSVC(dual=False)
clf.fit(X_train, y_train)


# Cellule 8 : Évaluation sur l'ensemble de développement


In [9]:
# Featurise the development messages with the lexicon built from the training set.
word_counts_dev = etape4(dev_data, word_to_index)
svm_dev_data = convert_to_svm_format(dev_data, word_counts_dev)

# Gold labels as integers.
y_dev = [label_mapping[sentiment] for sentiment in dev_data['sentiment']]

# Dense count matrix: one row per message, one column per lexicon word.
X_dev = [[0] * len(word_to_index) for _ in range(len(dev_data))]
for i, sample in enumerate(word_counts_dev):
    for index, count in sample.items():
        # Lexicon indices start at 1 while list columns start at 0.
        X_dev[i][index - 1] = count

y_pred_dev = clf.predict(X_dev)
print("Rapport de classification pour l'ensemble de développement:")
print(classification_report(y_dev, y_pred_dev))


Rapport de classification pour l'ensemble de développement:
              precision    recall  f1-score   support

          -1       0.58      0.41      0.48       340
           0       0.64      0.69      0.67       739
           1       0.62      0.66      0.64       575

    accuracy                           0.62      1654
   macro avg       0.61      0.59      0.59      1654
weighted avg       0.62      0.62      0.62      1654



# Cellule 9 : Évaluation sur l'ensemble de test


In [10]:
# Featurise the test messages with the lexicon built from the training set.
word_counts_test = etape4(test_data, word_to_index)
svm_test_data = convert_to_svm_format(test_data, word_counts_test)

# Gold labels as integers.
y_test = [label_mapping[sentiment] for sentiment in test_data['sentiment']]

# Dense count matrix: one row per message, one column per lexicon word.
X_test = [[0] * len(word_to_index) for _ in range(len(test_data))]
for i, sample in enumerate(word_counts_test):
    for index, count in sample.items():
        # Lexicon indices start at 1 while list columns start at 0.
        X_test[i][index - 1] = count

y_pred_test = clf.predict(X_test)
print("Rapport de classification pour l'ensemble de test:")
print(classification_report(y_test, y_pred_test))


Rapport de classification pour l'ensemble de test:
              precision    recall  f1-score   support

          -1       0.57      0.37      0.45       559
           0       0.61      0.76      0.68      1513
           1       0.70      0.62      0.66      1475

    accuracy                           0.64      3547
   macro avg       0.63      0.58      0.60      3547
weighted avg       0.64      0.64      0.64      3547

