In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from libs.SyntheticDataGenerator import fill_missing_labels, fill_synthetic_data_percentage
from libs.SentimentClassifier import preprocess, evaluation

import torch_directml

# Quick end-to-end smoke test on a small, reproducible subset of data.csv.
device = torch_directml.device()

# Tab-separated file; the column names are supplied explicitly via `names`.
data = pd.read_csv(
    filepath_or_buffer='../data/data.csv',
    sep='\t',
    encoding='utf8',
    names=["ID", "Label", "Tweet"],
)

# Seed the subsample as well: the seeded split below is only reproducible
# when the rows it receives are the same on every run. The size is capped so
# that a file with fewer than 100 rows does not make `sample` raise.
data = data.sample(n=min(100, len(data)), random_state=42)

train_dataset, val_dataset = train_test_split(data, test_size=0.5, random_state=42)

# The result of `preprocess` gets its own name instead of overwriting the
# validation DataFrame with an object of a different kind.
# NOTE(review): presumably `preprocess` returns a DataLoader - confirm in libs.SentimentClassifier.
train_dataloader = preprocess(train_dataset, batch_size=10)
val_dataloader = preprocess(val_dataset, batch_size=10)

evaluation(train_dataloader, val_dataloader, epochs=2, device=device)

In [None]:
# Load the Sentiment140 training CSV; the column names are supplied
# explicitly via `names`.
data = pd.read_csv(
    filepath_or_buffer='../data/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    names=["Label", "ID", "Timestamp", "Query", "Username", "Tweet"],
)

# A single seeded draw replaces the former "shuffle everything, then sample":
# `sample` already returns a random subset without replacement, so the extra
# full shuffle added nothing. The seed makes the subset reproducible and the
# cap avoids a ValueError if the file holds fewer than 500,000 rows.
data = data.sample(n=min(500_000, len(data)), random_state=42)

In [None]:
# Tally how many rows carry each label and show the tally.
label_counts = data['Label'].value_counts()
print(label_counts)

# The rarest label decides how many rows every label is allowed to keep.
desired_count = min(label_counts)

# Draw that many random rows from each label group so all groups end up
# the same size. The index of the result is not reset here.
filtered_df = (
    data
    .groupby('Label')
    .apply(lambda group: group.sample(n=desired_count))
)

# Preview the first 50 rows of the balanced frame.
filtered_df.head(n=50)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Works on the DataFrame `data`, whose "Label" column holds the class labels.

# Keep the label column around and count how often each distinct label occurs.
labels = data['Label']
label_counts = labels.value_counts()

# Figure size first, then the seaborn style (same order as before).
plt.figure(figsize=(8, 6))
sns.set(style="darkgrid")

# Pie chart of the label distribution: one wedge per label, annotated with
# its share in percent.
plt.pie(
    x=label_counts,
    labels=label_counts.index,
    autopct='%1.1f%%',
)

plt.title('Verteilung der Labels')

plt.show()


In [None]:
# Textual sentiment label -> integer class id. Built once instead of on
# every call, because the function is applied to every row of the label column.
_LABEL_MAPPING = {
    'negative': 0,
    'neutral': 1,
    'positive': 2,
}


def label_to_number(label):
    """Translate a textual sentiment label into its integer class id.

    String labels are matched case-insensitively and with surrounding
    whitespace ignored, so ``' Positive '`` is treated like ``'positive'``.

    Args:
        label: One of ``'negative'``, ``'neutral'`` or ``'positive'``.

    Returns:
        ``0`` for negative, ``1`` for neutral, ``2`` for positive.

    Raises:
        KeyError: If ``label`` is not a known label name (for example a
            numeric code); the message lists the accepted names.
    """
    # Only strings are normalised; anything else goes to the lookup untouched
    # so that unknown values still surface as KeyError.
    key = label.strip().lower() if isinstance(label, str) else label
    try:
        return _LABEL_MAPPING[key]
    except KeyError:
        raise KeyError(
            f"Unknown sentiment label {label!r}; expected one of {sorted(_LABEL_MAPPING)}"
        ) from None


# Encode straight from the DataFrame column instead of from `labels` itself:
# the cell then gives the same result on every run, rather than failing with
# a KeyError when it is re-run on labels that are already integers.
labels = data['Label'].apply(label_to_number)

In [None]:
import torch
from transformers import AutoTokenizer


# Load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Every tweet is padded or truncated to exactly this many tokens.
MAX_SEQUENCE_LENGTH = 100

# The texts to encode. `tweets` was not defined anywhere before this point;
# it is taken from the same frame the labels come from so rows stay aligned.
tweets = data['Tweet']

# Per-tweet encodings are collected here and concatenated further below.
input_ids = []
attention_masks = []


def tweet_pipeline(tweet):
    """Tokenise one tweet and append its encoding to the module-level lists.

    The tweet is encoded with special tokens, then padded or truncated to
    ``MAX_SEQUENCE_LENGTH`` tokens so that every row has the same width and
    the rows can be concatenated afterwards.

    Args:
        tweet: Text of a single tweet.

    Returns:
        None. ``input_ids`` and ``attention_masks`` are extended as a side
        effect.
    """
    encoded_dict = tokenizer.encode_plus(tweet,
                                         add_special_tokens = True,
                                         max_length = MAX_SEQUENCE_LENGTH,
                                         padding = 'max_length',
                                         truncation = True,
                                         return_attention_mask = True,
                                         return_tensors = 'pt')
    input_ids.append(encoded_dict["input_ids"])
    attention_masks.append(encoded_dict["attention_mask"])


# A plain loop: the pipeline is called for its side effect only, so there is
# nothing worth collecting into a Series of None values.
for tweet in tweets:
    tweet_pipeline(tweet)

# Stack the per-tweet rows into single tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)

# Convert through NumPy so the conversion is purely positional and does not
# depend on the (shuffled) Series index. The isinstance guard keeps the cell
# re-runnable once `labels` is already a tensor.
if isinstance(labels, pd.Series):
    labels = torch.tensor(labels.to_numpy())
else:
    labels = torch.as_tensor(labels)

In [None]:
from libs.SentimentClassifier import k_cross_fold_validation, evaluation
from transformers import logging
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split


# Run configuration.
seed_val = 42    # seed for torch's global random number generator
k = 2            # number of folds; only used by the disabled k-fold call below
batch_size = 5
epochs = 2
device = "cpu"

# Actually apply the seed: without this the random split and the shuffling
# of the training batches differ on every run.
torch.manual_seed(seed_val)

# Only show errors from the transformers logger.
logging.set_verbosity_error()
dataset = TensorDataset(input_ids, attention_masks, labels)

# 40% of the samples go to training, the remainder to validation.
train_size = int(0.4 * len(dataset))
val_size = len(dataset) - train_size

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Training batches are drawn in random order.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size,
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size,
)

# Disabled alternative: k_cross_fold_validation(dataset, k, epochs, batch_size, device)

# Keep the result: the confusion-matrix cell further down reads `training_stats`.
# NOTE(review): assumes `evaluation` returns the per-epoch statistics list - confirm in libs.SentimentClassifier.
training_stats = evaluation(train_dataloader, validation_dataloader, epochs, device)
training_stats

In [None]:
# Class id -> display name. This has to mirror the encoding applied by
# label_to_number (negative=0, neutral=1, positive=2); otherwise the axes of
# the heatmap name the wrong classes.
class_mapping = {
    0: 'negative',
    1: 'neutral',
    2: 'positive'
}

# One validation confusion matrix per entry of the training statistics.
all_cm = [i['Valid. Confusion Matrix'] for i in training_stats]

# Element-wise mean over all matrices, rounded up to whole counts.
avrg_confusion_matrix = torch.ceil(torch.mean(torch.stack(all_cm) , dim=0))

plt.figure(figsize=(15,10))

# Order the names by class id so position i is labelled with class i.
# NOTE(review): assumes matrix rows/columns are ordered by class id - confirm against `evaluation`.
class_names = [class_mapping[class_id] for class_id in sorted(class_mapping)]

# Hand pandas a plain NumPy array rather than a torch tensor.
df_cm = pd.DataFrame(
    avrg_confusion_matrix.detach().cpu().numpy(),
    index=class_names,
    columns=class_names,
).astype(int)
heatmap = sns.heatmap(df_cm, annot=True, fmt="d")

heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right',fontsize=15)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right',fontsize=15)
plt.ylabel('True label')
plt.xlabel('Predicted label')