In [1]:
import logging
import pickle

import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
import pandas as pd
from pandas import Series
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch
from transformers import pipeline
import spacy

from experimental.experimental_df import ohe_col_48, encoded_data_col_47, new_cols_15

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
text = """Die Digitalisierung hat in den letzten zwei Jahrzehnten nahezu alle Bereiche des gesellschaftlichen Lebens tiefgreifend verändert. Besonders im Bereich der Kommunikation sind diese Veränderungen deutlich spürbar: Während vor dem Jahr 2000 Telefongespräche und persönliche Treffen dominierten, werden heute soziale Netzwerke, Messenger-Dienste und Videokonferenzen genutzt, um sich in Echtzeit über Kontinente hinweg auszutauschen. Diese neuen Kommunikationsmittel haben nicht nur unsere persönlichen Beziehungen beeinflusst, sondern auch die Arbeitswelt revolutioniert. Homeoffice, digitale Projektarbeit und der Zugang zu globalen Talentpools sind nur einige Beispiele dafür, wie moderne Informationstechnologien neue Formen der Zusammenarbeit ermöglichen.

Gleichzeitig wirft die Digitalisierung auch gesellschaftliche Fragen auf. Die ständige Verfügbarkeit digitaler Medien kann zu Informationsüberflutung, Konzentrationsproblemen und einer beschleunigten Lebensweise führen. Darüber hinaus stehen Fragen des Datenschutzes, der digitalen Souveränität und der Abhängigkeit von großen Tech-Konzernen im Fokus öffentlicher Debatten. Gerade in Zeiten politischer Unsicherheiten und zunehmender Polarisierung gewinnt die Frage an Bedeutung, wie digitale Räume gestaltet werden müssen, um demokratische Diskurse zu fördern und gesellschaftlichen Zusammenhalt zu sichern.

Auch im Bildungswesen zeigt sich ein tiefgreifender Wandel: Digitale Lernplattformen, virtuelle Klassenzimmer und KI-gestützte Lernsysteme bieten neue Möglichkeiten, aber stellen Lehrende und Lernende auch vor neue Herausforderungen. Neben der technischen Infrastruktur sind didaktische Konzepte und digitale Kompetenzen entscheidend dafür, ob diese Entwicklungen tatsächlich zu mehr Bildungsgerechtigkeit und besseren Lernerfolgen führen. Während manche Schüler:innen von den neuen Möglichkeiten profitieren, besteht gleichzeitig die Gefahr, dass bestehende Ungleichheiten durch fehlenden Zugang zu Geräten oder Unterstützung noch verstärkt werden.

Letztlich ist die Digitalisierung kein Selbstzweck, sondern ein gesellschaftlicher Gestaltungsprozess. Es liegt an Politik, Wirtschaft, Wissenschaft und Zivilgesellschaft, diesen Prozess aktiv mitzugestalten – damit die digitale Transformation nicht nur effizient, sondern auch gerecht, inklusiv und nachhaltig erfolgt."""

In [4]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(name)s: %(message)s'
)

logger = logging.getLogger(__name__)



In [8]:
if torch.cuda.is_available():
    cuda_name = torch.cuda.get_device_name()
    logger.info(f"Cuda '{cuda_name}' is avalaible Yeeah 😀")
else:
    logger.error("Cuda is not found 🤕")

2025-07-07 16:23:13,332 [INFO] __main__: Cuda 'NVIDIA GeForce RTX 3060 Ti' is avalaible Yeeah 😀


In [13]:
summarizer = pipeline(task="summarization",
                      model="facebook/bart-large-cnn",
                      device=0)

output = summarizer(text, max_length=450, do_sample=False)[0]["summary_text"]

output

Device set to use cuda:0


'Die Digitalisierung hat in den letzten zwei Jahrzehnten nahezu alle Bereiche des gesellschaftlichen Lebens tiefgreifend verändert. Diese neuen Kommunikationsmittel haben nicht nur unsere persönlichen Beziehungen beeinflusst, sondern auch die Arbeitswelt revolutioniert.'

In [18]:
sentiment_analyzer = pipeline(task="sentiment-analysis",
                                  model="cardiffnlp/twitter-roberta-base-sentiment",
                                  device=0)

output_2 = sentiment_analyzer("Missing")
output_2

Device set to use cuda:0


[{'label': 'LABEL_1', 'score': 0.5228821635246277}]

In [33]:
ser = pd.read_pickle("data_rdy_to_concat/Why_or_why_not_sentiment.pkl")

df = pd.DataFrame(ser.value_counts(dropna=False)).reset_index()

In [34]:
mapping_func = {
    0: "Negativ",
    1: "Neutral",
    2: "Positiv",
}

col = df.columns[0]
df[col] = df[col].map(mapping_func)

In [None]:
import seaborn as sns

plt.rcParams.update({
    "figure.dpi": 300,
    "savefig.dpi": 300,
    "font.size": 11,
    "axes.titlesize": 12,
    "axes.labelsize": 11,
    "xtick.labelsize": 10,
    "ytick.labelsize": 10,
    "legend.fontsize": 10,
    "legend.frameon": False,
    "axes.spines.top": False,
    "axes.spines.right": False,
})

sns.barplot(df, x=df[col], y=df["count"])
plt.xlabel(None)
plt.ylabel("Anzahl")
plt.title("Why or why not? - Nach Sentiment Analysis")

plt.savefig("../Plots/Why_or_why_not_Sentiment.png")

In [53]:
feature_extractor = pipeline(
        task="feature-extraction",
        model="sentence-transformers/all-MiniLM-L6-v2",
        device=0,
        use_fast=True
    )

ser = pd.read_pickle(r"C:\Users\achim\Desktop\OSMI_Mental_Health\data\edited\df_alle_OHE_und_OE_Fragen.pkl")["Why or why not?"]
text_list = ser.to_list()

embeddings = feature_extractor(text_list)

Device set to use cuda:0


ValueError: text input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples).

In [51]:
np.array(embeddings[0][0]).size

384

In [4]:
df = pd.read_pickle("embedded_df.pkl")

In [5]:
from umap import UMAP

umap = UMAP(n_components=20,
            n_neighbors=15,
            min_dist=0.01,
            metric="cosine",
            random_state=42
            )

embeddings_reduced = umap.fit_transform(df)

  warn(


In [7]:
pd.DataFrame(embeddings_reduced).tail(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
1001,-5.885331,-2.458134,12.498726,8.452083,0.624113,-0.342873,-0.810754,8.22349,5.138295,8.126019,5.520603,7.115495,5.418382,5.234456,3.527302,1.508443,2.011702,4.608512,6.241548,5.004623
1002,-5.334691,-2.780894,11.892725,8.558484,0.631256,-0.035866,-0.622096,8.146992,5.160256,8.048045,5.450404,7.051339,5.251226,4.585941,3.45099,1.763914,1.706929,5.108117,6.294127,4.805223
1003,-6.35635,-2.99346,12.046243,8.229743,0.235586,-0.869686,-0.69079,8.030947,5.549613,8.253297,4.817998,6.778785,5.245746,5.151895,3.90998,1.877777,1.164922,4.796104,5.836863,4.871923
1004,-5.657842,-2.359765,12.521283,8.498944,0.718241,-0.225443,-0.835094,8.235729,5.036784,8.066155,5.656741,7.158628,5.447667,5.193591,3.472574,1.4249,2.17101,4.586699,6.323878,5.010507
1005,-5.94864,-2.104896,12.503705,8.273068,0.675605,-0.473251,-0.353901,8.125721,5.462797,8.162793,5.356654,6.943123,5.237601,5.104367,3.825642,1.958257,1.588086,4.635153,5.851623,4.981418
1006,20.782804,4.273787,5.862801,7.149343,5.13135,1.303405,4.79576,6.516902,3.925598,5.865599,7.565609,5.404445,7.184486,3.540267,5.474432,3.280584,1.963402,6.156015,3.546131,6.142787
1007,20.821882,4.20806,5.532084,7.105782,5.02122,1.45736,4.771316,6.551763,4.046038,6.202112,7.65416,5.527157,7.166305,3.333836,5.377075,3.415147,1.797114,5.919675,3.668403,6.3887
1008,-5.85812,-3.011234,11.750479,8.174703,0.251932,-0.666926,-0.737224,8.348554,5.17331,8.196588,5.280232,6.89742,5.685299,5.323799,3.802688,1.75001,1.488292,4.842981,6.130026,5.132706
1009,-6.151248,-0.483529,13.016492,8.322101,1.479982,0.078024,0.541187,7.874074,6.048054,7.70396,5.903625,6.967148,4.743035,4.989789,3.845724,2.193888,1.925696,4.338795,5.185927,5.021033
1010,-4.851116,-0.018781,12.000163,7.909996,1.548786,-0.232445,1.453264,8.042638,6.343998,7.217369,6.526805,6.539337,5.460822,5.306953,4.524753,2.787671,1.541703,4.1263,4.647806,5.459696


In [63]:
from sklearn.manifold import trustworthiness

n_components_list = [2, 5, 10, 20, 50]
trust_scores = []

for n in n_components_list:
    umap = UMAP(n_components=n,
            n_neighbors=15,
            min_dist=0.01,
            metric="cosine",
            random_state=42
            )

    embeddings_reduced = umap.fit_transform(df)

    score = trustworthiness(df, embeddings_reduced, n_neighbors=5, metric="cosine")
    trust_scores.append(score)

optimal_n = n_components_list[np.argmax(trust_scores)]

  warn(
  warn(
  warn(
  warn(
  warn(


In [None]:
import seaborn as sns

sns.lineplot(x=n_components_list, y=trust_scores, linewidth=2)
plt.plot(10, trust_scores[2], marker="x", color="red", markersize=10, markeredgewidth=1.5, label="Optimaler Wert")
plt.plot(5, trust_scores[1], marker="x", color="green", markersize=10, markeredgewidth=1.5, label="Ausreichender Wert")
plt.xlabel("n_components")
plt.ylabel("Trust-Scores")
plt.legend()
plt.title("Trust-Scores nach UMAP-Reduktion")

plt.savefig("../Plots/Trust_Scores_UMAP_Why_or_why_not.png", dpi=300)

In [27]:
from kneed import KneeLocator

k1 = KneeLocator(n_components_list, trust_scores, curve="concave", direction="increasing")
optimal_n_knee = k1.elbow

In [38]:
df = pd.read_pickle("reduced_df.pkl")

In [None]:
# KMeans-Hyperparameter-Testing
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

X = df.to_numpy()

n_clusters_list = list(range(2, 20))

scores = []
for n in n_clusters_list:
    kmeans = KMeans(n_clusters=n, random_state=42).fit(X)

    score = silhouette_score(X, kmeans.labels_, metric="cosine")
    scores.append(score)

sns.set_theme()
plt.plot(n_clusters_list, scores, 'bo-', linewidth=2, markersize=8)
plt.plot(2, scores[0], marker="x", color="red", markersize=10, markeredgewidth=1.5, label="Bester Wert (k=2)")
plt.xticks(n_clusters_list)
plt.xlabel("Anzahl der Cluster (k)")
plt.ylabel("Silhouetten-Score")
plt.title("Optimales k basierend auf dem Silhouetten-Score", fontweight='bold')
plt.legend()
plt.grid(True)
plt.show()
# plt.savefig("../Plots/kmeans_clustering_embeddings_silhoutte.png", dpi=300)

In [48]:
labels = kmeans.labels_

In [49]:
labels

array([13,  9, 16, ..., 18, 11, 10], shape=(1011,), dtype=int32)

In [56]:
tt = pd.read_pickle("data_rdy_to_concat/Why_or_why_not1_sentiment.pkl")

In [58]:
tt.value_counts(dropna=False)

Why or why not?.1
1    509
0    492
2     10
Name: count, dtype: int64

In [59]:
type(tt)

pandas.core.series.Series

In [60]:
aa = pd.read_pickle("data_rdy_to_concat/Why_or_why_not_1_cluster_labels.pkl")

In [61]:
aa.value_counts(dropna=False)

Text Cluster
0    818
1    193
Name: count, dtype: int64

In [95]:
col = "If yes, what condition(s) have you been diagnosed with?"
ser = pd.read_pickle(r"C:\Users\achim\Desktop\OSMI_Mental_Health\data\edited\df_alle_OHE_und_OE_Fragen.pkl")[col]

In [96]:
df.value_counts(dropna=False)

If yes, what condition(s) have you been diagnosed with?
other                                                      632
Anxiety Disorder                                           258
Mood Disorder                                              122
Name: count, dtype: int64

In [77]:
ser = ser.apply(lambda x: "Anxiety Disorder" if "anxiety disorder" in str(x).lower() else "Mood Disorder" if "mood disorder" in str(x).lower() else "Attention Deficit" if "attention deficit" in str(x).lower() else "other")

In [80]:
ser.value_counts(dropna=False)

If yes, what condition(s) have you been diagnosed with?
other                614
Anxiety Disorder     258
Mood Disorder        122
Attention Deficit     18
Name: count, dtype: int64

In [83]:
ser = ser.replace("Attention Deficit", "other")

In [84]:
ser.value_counts(dropna=False)

If yes, what condition(s) have you been diagnosed with?
other               632
Anxiety Disorder    258
Mood Disorder       122
Name: count, dtype: int64

In [86]:
df = pd.DataFrame(ser, index=ser.index)

In [88]:
col_53 = "If yes, what condition(s) have you been diagnosed with?"

In [97]:
from sklearn.preprocessing import OneHotEncoder

ohe_col_53 = OneHotEncoder(sparse_output=False)
encoded_data_col_53 = ohe_col_53.fit_transform(df)

categories = ohe_col_53.categories_
new_cols_53 = [f"{col_53}_{cat}" for cat in categories]

col_53_df = pd.DataFrame(encoded_data_col_53, columns=new_cols_53, index=df.index)

ValueError: Shape of passed values is (1012, 3), indices imply (1012, 1)

In [90]:
col = "If yes, what condition(s) have you been diagnosed with?"
df1 = pd.read_pickle(r"C:\Users\achim\Desktop\OSMI_Mental_Health\data\edited\df_alle_OHE_und_OE_Fragen.pkl")

df1 = df1.drop(columns=[col])
df1 = pd.concat([df1, col_53_df], axis=1)

In [91]:
df1.index

Index([                  1,                   2,                   4,
                         5,                   6,                   7,
                         8,                  10,                  11,
                        12,
       ...
                      1419,                1421,                1422,
                      1424,                1425,                1426,
                      1427,                1430,                1431,
       'Attention Deficit'],
      dtype='object', length=1012)

In [92]:
df1[col]

KeyError: 'If yes, what condition(s) have you been diagnosed with?'

In [93]:
df1[new_cols_53]

Unnamed: 0,"If yes, what condition(s) have you been diagnosed with?_['Anxiety Disorder' 'Mood Disorder' 'other']"
1,<Compressed Sparse Row sparse matrix of dtype ...
2,<Compressed Sparse Row sparse matrix of dtype ...
4,<Compressed Sparse Row sparse matrix of dtype ...
5,<Compressed Sparse Row sparse matrix of dtype ...
6,<Compressed Sparse Row sparse matrix of dtype ...
...,...
1426,<Compressed Sparse Row sparse matrix of dtype ...
1427,<Compressed Sparse Row sparse matrix of dtype ...
1430,<Compressed Sparse Row sparse matrix of dtype ...
1431,<Compressed Sparse Row sparse matrix of dtype ...


In [98]:
col_53 = "If yes, what condition(s) have you been diagnosed with?"
df = pd.read_pickle(r"C:\Users\achim\Desktop\OSMI_Mental_Health\data\edited\df_alle_OHE_und_OE_Fragen.pkl")


In [100]:
df[col_53] = df[col_53].apply(lambda x: "Anxiety Disorder" if "anxiety disorder" in str(x).lower() else "Mood Disorder" if "mood disorder" in str(x).lower() else "Attention Deficit" if "attention deficit" in str(x).lower() else "other")

In [101]:
df[col_53].value_counts(dropna=False)

If yes, what condition(s) have you been diagnosed with?
other                613
Anxiety Disorder     258
Mood Disorder        122
Attention Deficit     18
Name: count, dtype: int64

In [102]:
df[col_53] = df[col_53].replace("Attention Deficit", "other")

In [103]:
df[col_53].value_counts(dropna=False)

If yes, what condition(s) have you been diagnosed with?
other               631
Anxiety Disorder    258
Mood Disorder       122
Name: count, dtype: int64

In [104]:
ohe_col_53 = OneHotEncoder(sparse_output=False)
encoded_data_col_53 = ohe_col_53.fit_transform(df[[col_53]])

categories_53 = ohe_col_53.categories_[0]
new_cols_53 = [f"{col_53}_{cat}" for cat in categories_53]

col_53_df = pd.DataFrame(encoded_data_col_53, columns=new_cols_53, index=df.index)
df = df.drop(columns=[col_53])
df = pd.concat([df, col_53_df], axis=1)

In [106]:
df[new_cols_53].tail()

Unnamed: 0,"If yes, what condition(s) have you been diagnosed with?_Anxiety Disorder","If yes, what condition(s) have you been diagnosed with?_Mood Disorder","If yes, what condition(s) have you been diagnosed with?_other"
1425,0.0,0.0,1.0
1426,0.0,0.0,1.0
1427,1.0,0.0,0.0
1430,0.0,0.0,1.0
1431,1.0,0.0,0.0


In [107]:
df.shape

(1011, 85)

In [108]:
col_54 = "If maybe, what condition(s) do you believe you have?"

In [112]:
df = df.drop(columns=[col_54])

In [113]:
col_55 = "If so, what condition(s) were you diagnosed with?"

In [114]:
df[col_55].value_counts(dropna=False)

If so, what condition(s) were you diagnosed with?
<NA>                                                                                                                                                                                                                                                                                             477
Mood Disorder (Depression, Bipolar Disorder, etc)                                                                                                                                                                                                                                                139
Anxiety Disorder (Generalized, Social, Phobia, etc)|Mood Disorder (Depression, Bipolar Disorder, etc)                                                                                                                                                                                            124
Anxiety Disorder (Generalized, Social, Phobia, etc)                    

In [115]:
df[col_55] = df[col_55].apply(lambda x: "Anxiety Disorder" if "anxiety disorder" in str(x).lower() else "Mood Disorder" if "mood disorder" in str(x).lower() else "Attention Deficit" if "attention deficit" in str(x).lower() else "other")

In [116]:
df[col_55].value_counts(dropna=False)

If so, what condition(s) were you diagnosed with?
other                507
Anxiety Disorder     283
Mood Disorder        196
Attention Deficit     25
Name: count, dtype: int64

In [117]:
df[col_55] = df[col_55].replace("Attention Deficit", "other")

In [118]:
ohe_col_55 = OneHotEncoder(sparse_output=False)
encoded_data_col_55 = ohe_col_55.fit_transform(df[[col_55]])

categories_55 = ohe_col_55.categories_[0]
new_cols_55 = [f"{col_55}_{cat}" for cat in categories_55]

col_55_df = pd.DataFrame(encoded_data_col_55, columns=new_cols_55, index=df.index)
df = df.drop(columns=[col_55])
df = pd.concat([df, col_55_df], axis=1)

In [119]:
col_56 = "What is your age?"

In [120]:
df[col_56].value_counts(dropna=False)

What is your age?
30     68
32     59
29     57
31     57
33     56
35     56
28     56
26     51
34     50
27     45
36     43
37     40
39     38
38     36
40     29
24     24
25     23
44     21
45     21
43     20
42     19
22     18
41     18
46     14
23     13
47     12
21      9
49      8
52      6
50      4
51      4
54      4
48      4
57      3
55      3
53      3
56      2
59      2
19      2
63      2
17      1
20      1
61      1
58      1
323     1
99      1
66      1
3       1
65      1
74      1
70      1
Name: count, dtype: Int64

In [123]:
# Unwahrscheinliche Alterswerte bereinigen (durch den Modus ersetzen)
df.loc[df[col_56] < 15, col_56] = df[col_56].mode().iloc[0]
df.loc[df[col_56] > 90, col_56] = df[col_56].mode().iloc[0]

In [125]:
df[col_56].min()

np.int64(17)

In [126]:
# Binning für Alterspalte
from sklearn.preprocessing import KBinsDiscretizer

# encode="ordinal": KBinsDiscretizer sortiert von jung nach alt
kbin = KBinsDiscretizer(n_bins=4, encode="ordinal", strategy="uniform")

df[col_56] = kbin.fit_transform(df[[col_56]])

In [127]:
df[col_56].value_counts(dropna=False)

What is your age?
1.0    506
0.0    428
2.0     70
3.0      7
Name: count, dtype: int64

In [130]:
Bin_Schritte = (74 - 17) / 4

In [133]:
# immer das End-Alter je Bin
mapping_bins_alter = {
    0: 17 + Bin_Schritte,
    1: 17 + 2 * Bin_Schritte,
    2: 17 + 3 * Bin_Schritte,
    3: 17 + 4 * Bin_Schritte
}

In [135]:
col_57 = "What is your gender?"

In [136]:
df[col_57].value_counts(dropna=False)

What is your gender?
Male                                                       436
male                                                       166
Female                                                     111
female                                                      79
M                                                           66
m                                                           45
F                                                           24
f                                                           16
Female                                                       6
woman                                                        4
Male                                                         4
<NA>                                                         3
Woman                                                        2
Male (cis)                                                   2
Agender                                                      2
non-binary                        

In [140]:
df[col_57] = pd.read_pickle(r"C:\Users\achim\Desktop\OSMI_Mental_Health\data\edited\df_alle_OHE_und_OE_Fragen.pkl")[col_57]

In [141]:
df[col_57].value_counts()

What is your gender?
Male                                                       436
male                                                       166
Female                                                     111
female                                                      79
M                                                           66
m                                                           45
F                                                           24
f                                                           16
Female                                                       6
Male                                                         4
woman                                                        4
Woman                                                        2
Male (cis)                                                   2
Agender                                                      2
non-binary                                                   2
Female assigned at birth          

In [142]:
import re
def gender_regex(x):
    x = str(x).lower().strip()
    if re.search(r'\b(male|m|man)\b', x):
        return 'm'
    elif re.search(r'\b(female|f|woman)\b', x):
        return 'f'
    else:
        return 'other'

In [143]:
df[col_57] = df[col_57].apply(gender_regex)

In [144]:
df[col_57].value_counts(dropna=False)

What is your gender?
m        730
f        254
other     27
Name: count, dtype: int64

In [145]:
ohe_col_57 = OneHotEncoder(sparse_output=False)
encoded_data_col_57 = ohe_col_57.fit_transform(df[[col_57]])

categories_57 = ohe_col_57.categories_[0]
new_cols_57 = [f"{col_57}_{cat}" for cat in categories_57]

col_57_df = pd.DataFrame(encoded_data_col_57, columns=new_cols_57, index=df.index)
df = df.drop(columns=[col_57])
df = pd.concat([df, col_57_df], axis=1)


In [146]:
col_58 = "What country do you live in?"
df[col_58].value_counts(dropna=False)

What country do you live in?
United States of America    661
United Kingdom              110
Canada                       56
Germany                      28
Netherlands                  23
Australia                    23
Sweden                       11
Brazil                        9
France                        9
Ireland                       9
New Zealand                   6
Denmark                       5
Switzerland                   5
Finland                       5
South Africa                  4
Bulgaria                      4
Russia                        4
Norway                        3
Pakistan                      3
India                         3
Spain                         3
Chile                         3
Afghanistan                   2
Colombia                      2
Italy                         2
Israel                        2
Estonia                       2
Bangladesh                    1
Argentina                     1
Mexico                        1
Vietnam    

In [147]:
countries = ["United States of America", "United Kingdom", "Canada"]
df[col_58] = df[col_58].where(df[col_58].isin(countries), "other")

In [148]:
df[col_58].value_counts(dropna=False)

What country do you live in?
United States of America    661
other                       184
United Kingdom              110
Canada                       56
Name: count, dtype: Int64

In [149]:
ohe_col_58 = OneHotEncoder(sparse_output=False)
encoded_data_col_58 = ohe_col_58.fit_transform(df[[col_58]])

categories_58 = ohe_col_58.categories_[0]
new_cols_58 = [f"{col_58}_{cat}" for cat in categories_58]

col_58_df = pd.DataFrame(encoded_data_col_58, columns=new_cols_58, index=df.index)
df = df.drop(columns=[col_58])
df = pd.concat([df, col_58_df], axis=1)


In [151]:
col_59 = "What country do you work in?"
countries = ["United States of America", "United Kingdom", "Canada"]
df[col_59] = df[col_59].where(df[col_59].isin(countries), "other")

In [153]:
df[col_59].value_counts()


What country do you work in?
United States of America    666
other                       182
United Kingdom              109
Canada                       54
Name: count, dtype: Int64

In [154]:
# col_59 wird gelöscht, da identisch zu col_58
df = df.drop(columns=[col_59])

In [155]:
col_60 = "What US state or territory do you live in?"

In [156]:
df[col_60].value_counts(dropna=False)

What US state or territory do you live in?
<NA>                    350
California               99
Illinois                 42
Michigan                 38
Minnesota                36
Texas                    35
New York                 32
Washington               30
Oregon                   28
Tennessee                27
Pennsylvania             26
Colorado                 21
Ohio                     20
Indiana                  19
Florida                  18
Massachusetts            18
North Carolina           17
Virginia                 13
Maryland                 13
Kansas                   12
Oklahoma                 12
Georgia                  11
Nebraska                 10
Wisconsin                 8
Missouri                  7
Utah                      6
Iowa                      5
Connecticut               5
New Hampshire             5
Maine                     5
New Jersey                4
South Dakota              4
Arizona                   4
Alabama                   3
Rhode

In [157]:
states = ["California", "Illinois", "Michigan", "Minnesota", "Texas", "New York", "Washington", "Oregon", "Tennessee", "Pennsylvania"]
df[col_60] = df[col_60].where(df[col_60].isin(states), "other")

In [158]:
df[col_60].value_counts(dropna=False)

What US state or territory do you live in?
other           618
California       99
Illinois         42
Michigan         38
Minnesota        36
Texas            35
New York         32
Washington       30
Oregon           28
Tennessee        27
Pennsylvania     26
Name: count, dtype: Int64

In [159]:
# col_60 wird gedroppt, wird jedoch später noch gesondert analysiert
df = df.drop(columns=[col_60])

In [160]:
# col_61 genauso
col_61 = "What US state or territory do you work in?"
df = df.drop(columns=[col_61])

In [161]:
col_62 = "Which of the following best describes your work position?"
df[col_62].value_counts(dropna=False)

Which of the following best describes your work position?
Back-end Developer                                                                                        196
Front-end Developer                                                                                       101
Other                                                                                                      90
Supervisor/Team Lead                                                                                       53
Back-end Developer|Front-end Developer                                                                     48
                                                                                                         ... 
Dev Evangelist/Advocate|DevOps/SysAdmin|Support|Back-end Developer|Front-end Developer|One-person shop      1
Executive Leadership|Supervisor/Team Lead|Front-end Developer                                               1
Other|Supervisor/Team Lead|Dev Evangelist/Advocate            

In [162]:
def workposition_regex(x):
    x = str(x).strip().lower()
    if re.search(r"front[-\s]?end", x):
        return "front"
    if re.search(r"back[-\s]?end", x):
        return "back"
    if re.search(r"lead", x):
        return "lead"
    if re.search(r"dev", x):
        return "DevOps"
    if re.search(r"support", x):
        return "support"
    else:
        return "other"


In [163]:
df[col_62] = df[col_62].apply(workposition_regex)

In [164]:
df[col_62].value_counts(dropna=False)

Which of the following best describes your work position?
front      346
back       303
other      123
lead       119
DevOps      86
support     34
Name: count, dtype: int64

In [165]:
ohe_col_62 = OneHotEncoder(sparse_output=False)
encoded_data_col_62 = ohe_col_62.fit_transform(df[[col_62]])

categories_62 = ohe_col_62.categories_[0]
new_cols_62 = [f"{col_62}_{cat}" for cat in categories_62]

col_62_df = pd.DataFrame(encoded_data_col_62, columns=new_cols_62, index=df.index)
df = df.drop(columns=[col_62])
df = pd.concat([df, col_62_df], axis=1)


In [166]:
df.index

Index([   1,    2,    4,    5,    6,    7,    8,   10,   11,   12,
       ...
       1418, 1419, 1421, 1422, 1424, 1425, 1426, 1427, 1430, 1431],
      dtype='int64', length=1011)

In [168]:
pd.to_pickle(df, "../data/edited/df_bis_auf_NLP_Spalten_rdy.pkl")

In [169]:
df = pd.read_pickle(r"C:\Users\achim\Desktop\OSMI_Mental_Health\data\edited\full_df_rdy.pkl")

In [170]:
df.shape

(1011, 97)

In [175]:
df = df.drop(columns=["Why or why not?", "Why or why not?.1"])

In [178]:
df["Text Cluster"].value_counts()

ValueError: Grouper for 'Text Cluster' not 1-dimensional

In [None]:
df[""]