In [1]:
import re
import pandas as pd
import csv
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import SnowballStemmer
import nltk
import spacy
from collections import Counter
from statistics import mean
# German spaCy pipeline, loaded once here and reused by lemmatize_string().
nlp = spacy.load('de_core_news_sm')
# German Snowball stemmer used by stemm_List_of_Words().
snowStemmer = SnowballStemmer(language='german')
# Path prefix (relative to the working directory) of the raw CSV inputs.
RAW_DATA_PATH = 'data/raw/'

Helper Functions

In [2]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list.

    Only one level of nesting is removed and the item order is preserved.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [3]:
def find_longest_word(word_list):
    """Return the length of the longest word in ``word_list``.

    Args:
        word_list: Iterable of sized items (typically strings).

    Returns:
        int: Length of the longest item, or 0 when ``word_list`` is empty.
    """
    # ``default`` keeps ``max`` from raising ValueError on an empty iterable.
    return max(map(len, word_list), default=0)

In [4]:
def lemmatize_string(input_text):
    """Lemmatize ``input_text`` in two passes and return the result upper-cased.

    The text is lemmatized once in lower case; the space-joined lemmas are then
    title-cased and lemmatized a second time before the final upper-casing.
    """
    def _joined_lemmas(text):
        # Run the module-level pipeline and space-join the lemma of every token.
        return ' '.join(token.lemma_ for token in nlp(text))

    lowercase_pass = _joined_lemmas(input_text.lower())
    titlecase_pass = _joined_lemmas(lowercase_pass.title())
    return titlecase_pass.upper()

In [5]:
def stemm_List_of_Words(input_text):
    """Stem every word of ``input_text`` and return the stems in upper case.

    Args:
        input_text: Iterable of word strings (despite the name, not one string).

    Returns:
        list: One upper-cased stem per input word, in input order.
    """
    stems = []
    for token in input_text:
        stems.append(snowStemmer.stem(token).upper())
    return stems

In [6]:
def contains_song(ID, JAHR, MONAT):
    """Return True if the global ``df`` has at least one row matching all three values.

    Args:
        ID: Value compared against the ``ID`` column.
        JAHR: Value compared against the ``JAHR`` column.
        MONAT: Value compared against the ``MONAT`` column.
    """
    same_song = df['ID'] == ID
    same_year = df['JAHR'] == JAHR
    same_month = df['MONAT'] == MONAT
    matches = df.loc[same_song & same_year & same_month]
    return len(matches) >= 1

In [7]:
def isSeasonal():
    """Fill the ``SEASONAL?`` column of the global ``df`` in place.

    For each row whose ``SEASONAL?`` value is NaN, every row sharing that row's
    ``ID`` is set to ``contains_song(ID, JAHR + 1, MONAT)``, i.e. whether the
    same song also has an entry for the same month of the following year.

    NOTE(review): ``df`` is written to while it is being iterated - confirm
    whether rows visited later are meant to overwrite the flag again.
    """
    for _, entry in df.iterrows():
        if not np.isnan(entry['SEASONAL?']):
            continue
        song_id = entry['ID']
        reappears = contains_song(song_id, entry['JAHR'] + 1, entry['MONAT'])
        df.loc[df['ID'] == song_id, 'SEASONAL?'] = reappears

Data Extraction

In [8]:
# Song table: identifying columns plus the four lyric parts.
lied_columns = ['ID','INTERPRET', 'TITEL', 'SPRACHE_DEUTSCH', 'TEXT_TEIL1', 'TEXT_TEIL2', 'TEXT_TEIL3', 'TEXT_TEIL4']
df_Lied = pd.read_csv(RAW_DATA_PATH + 'LIED.csv', usecols=lied_columns)

# Chart table: one row per chart entry, keyed by LIED_ID.
chart_columns = ['LIED_ID', 'POSITION', 'DATUM_VON', 'DATUM_BIS']
df_Chart_Position = pd.read_csv(RAW_DATA_PATH + 'CHART_POSITION.csv', usecols=chart_columns)

# Stop words: every cell of the CSV becomes one upper-cased entry of a flat list.
with open(RAW_DATA_PATH+'Stoppwords.csv', newline='', encoding='UTF-8') as stopword_file:
    stopwords_list = [word.upper() for row in csv.reader(stopword_file) for word in row]

Data Conversion

In [9]:
# Concatenate the four lyric parts (missing parts count as empty strings).
text_parts = ['TEXT_TEIL1', 'TEXT_TEIL2', 'TEXT_TEIL3', 'TEXT_TEIL4']
full_text = df_Lied[text_parts[0]].fillna('')
for part in text_parts[1:]:
    full_text = full_text + df_Lied[part].fillna('')
df_Lied['TEXT'] = full_text

# Parse both chart dates, then derive duration in days plus year and month of DATUM_BIS.
for date_column in ('DATUM_VON', 'DATUM_BIS'):
    df_Chart_Position[date_column] = pd.to_datetime(df_Chart_Position[date_column])

chart_end = df_Chart_Position['DATUM_BIS']
chart_span = chart_end - df_Chart_Position['DATUM_VON']
df_Chart_Position['DAUER'] = chart_span.dt.days.astype('int16')
df_Chart_Position['JAHR'] = chart_end.dt.year.astype('int16')
df_Chart_Position['MONAT'] = chart_end.dt.month.astype('int16')

Text Preprocessing

In [10]:
# Lyrics pipeline: tokenize -> stem (upper-cased) -> drop stop words -> strip numbers.
# The lemmatization step (lemmatize_string) is currently not applied.

# Raw string: '\w' / '\d' are not valid escapes in a plain string literal.
LYRICS_NUMBER_PATTERN = re.compile(r'\w*\d\w*')
# A set gives constant-time membership tests; the list was scanned once per token.
lyrics_stopwords = set(stopwords_list)


def _preprocess_lyrics(text):
    """Return the cleaned, stemmed, upper-cased tokens of one song text.

    Args:
        text: Raw lyrics as a single string.

    Returns:
        list: Stems that are not stop words, with digit-containing character
        runs stripped. Tokens that become empty by that stripping are dropped
        instead of being kept as '' entries.
    """
    stems = stemm_List_of_Words(nltk.word_tokenize(text))
    kept = [stem for stem in stems if stem not in lyrics_stopwords]
    stripped = (LYRICS_NUMBER_PATTERN.sub('', stem) for stem in kept)
    return [stem for stem in stripped if stem]


df_Lied['processed_TEXT'] = df_Lied['TEXT'].apply(_preprocess_lyrics)

KeyboardInterrupt: 

In [None]:
# Spot-check the preprocessed tokens of the entry at index 0.
print(df_Lied['processed_TEXT'][0])

In [None]:
# Raw lyrics of the same entry (index 0), for comparison with the token list.
print(df_Lied['TEXT'][0])

Data Selection

In [None]:
# Drop the raw lyric parts (already concatenated into TEXT) and SPRACHE_DEUTSCH.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
df_Lied = df_Lied.drop(
    columns=['TEXT_TEIL1', 'TEXT_TEIL2', 'TEXT_TEIL3', 'TEXT_TEIL4', 'SPRACHE_DEUTSCH'],
    errors='ignore',
)
df_Date = df_Chart_Position[['LIED_ID', 'DATUM_VON', 'DATUM_BIS', 'DAUER','JAHR', 'MONAT']]

df_Lied = df_Lied.sort_values(by='ID')
df_Chart_Position = df_Chart_Position.sort_values(by='LIED_ID')

# Join every chart entry to its song via the key (ID == LIED_ID).
# pd.concat(axis='columns') aligned rows by index label, not by song key, so it
# paired lyrics with unrelated chart rows whenever the two tables did not line up
# row for row. The inner join keeps only chart entries whose song is known.
df = (
    df_Lied
    .merge(df_Chart_Position, how='inner', left_on='ID', right_on='LIED_ID')
    .drop(columns='LIED_ID')
)
print(df.head())

Feature Creation

In [None]:
def _token_features(tokens):
    """Summarise one token list.

    Args:
        tokens: List of preprocessed tokens of one song.

    Returns:
        tuple: (number of distinct tokens, count of the most frequent token,
        the most frequent token, length of the longest token). An empty token
        list yields (0, 0, None, 0) instead of raising ValueError from max().
    """
    counts = Counter(tokens)
    if not counts:
        return 0, 0, None, 0
    # Ties go to the token that appears first, as with the previous stable sort.
    top_token, top_count = counts.most_common(1)[0]
    return len(counts), top_count, top_token, max(map(len, counts))


feature_columns = ['ANZ_UNIQUE_WOERTER', 'MAX_WORT_WDH', 'WORT_MAX_WDH', 'LAENGE_LAENGSTES_WORT']
token_features = [_token_features(tokens) for tokens in df['processed_TEXT']]
# Assign plain lists column by column so no index alignment is involved.
for position, column in enumerate(feature_columns):
    df[column] = [features[position] for features in token_features]

In [None]:
# Inspect the frame after the token-based features were added.
df

RANK_SCORES

In [None]:
# Invert the chart position so that a better (smaller) position gives a higher score:
# position 1 -> MAX_RANK, position MAX_RANK -> 1.
MAX_RANK = 50
df['RANK_SCORE'] = MAX_RANK - df['POSITION'] + 1

# Per-song aggregates broadcast back to every row of that song. One grouped pass
# replaces filtering the whole frame once per row (quadratic in the row count).
scores_by_song = df.groupby('ID')['RANK_SCORE']
df['MAX_RANK_SCORE'] = scores_by_song.transform('max')
# round() on the float mean rounds halves to even, like Python's built-in round().
df['MEAN_RANK_SCORE'] = scores_by_song.transform('mean').round().astype('int64')

Percentage of Stopwords

In [None]:
# Share of stop words in each song's vocabulary.
# The old denominator was len(df['TEXT']), i.e. the number of rows in the frame, so
# the value did not depend on the length of the song at all.
stopword_set = set(stopwords_list)
# stopwords_list is upper-cased, so the lyrics are upper-cased before comparing.
vocabularies = [set(words) for words in df['TEXT'].str.upper().str.split()]

# Number of distinct stop words that occur in the song.
df['NUMBER_OF_STOPWORDS'] = [len(vocabulary & stopword_set) for vocabulary in vocabularies]
# Distinct stop words divided by distinct words; 0.0 for a song without any words.
# NOTE(review): distinct/distinct keeps the set-based numerator - confirm that a
# vocabulary share (and not a share of all word occurrences) is what is wanted.
df['STOPWORD_PERCENTAGE'] = [
    round(len(vocabulary & stopword_set) / len(vocabulary), ndigits=5) if vocabulary else 0.0
    for vocabulary in vocabularies
]

In [None]:
# The raw count was only needed to derive STOPWORD_PERCENTAGE.
df.drop(columns='NUMBER_OF_STOPWORDS', inplace=True)

In [None]:
# Renumber the rows 0..n-1. drop=True discards the old index directly instead of
# inserting it as an 'index' column and deleting it again, which also failed when a
# column called 'index' already existed.
df.reset_index(drop=True, inplace=True)
print(df.head())

Title Analysis

In [None]:
# Title pipeline: tokenize on word characters -> stem (upper-cased) -> drop stop
# words -> replace digit-containing tokens by the placeholder 'NUMBER'.
# Lemmatizing the title is currently not applied.

tokenizer = RegexpTokenizer(r'\w+')
# Raw string: '\w' / '\d' are not valid escapes in a plain string literal.
TITLE_NUMBER_PATTERN = re.compile(r'\w*\d\w*')
# A set gives constant-time membership tests for the stop-word filter.
title_stopwords = set(stopwords_list)


def _preprocess_title(titel):
    """Return the stemmed, upper-cased, stop-word-free tokens of one title.

    Args:
        titel: Song title as a string.

    Returns:
        list: Stems of the title words; any token containing a digit is
        replaced by the literal 'NUMBER' (it is not removed).
    """
    # stemm_List_of_Words already returns upper-case stems, matching the stop words.
    stems = stemm_List_of_Words(tokenizer.tokenize(titel))
    kept = [stem for stem in stems if stem not in title_stopwords]
    return [TITLE_NUMBER_PATTERN.sub('NUMBER', stem) for stem in kept]


df['processed_TITLE'] = df['TITEL'].apply(_preprocess_title)

In [None]:
# Number of characters in the raw title.
df['LENGTH_TITLE'] = df['TITEL'].apply(len)

In [None]:
# Preview the first rows after the title columns were added.
print(df.head())

In [None]:
# Frequency table of all processed title tokens, most frequent first.
title_list = list(df['processed_TITLE'])
title_word_counts = Counter(flatten(title_list))
# most_common() sorts by descending count and keeps first-seen order among ties,
# which is what the previous hand-written stable sort produced.
df_Titel = pd.DataFrame(title_word_counts.most_common(), columns=['WORD', 'FREQ'])
print(df_Titel.head())

Seasonal determination

In [None]:
# Start from an all-NaN flag column; isSeasonal() only acts on rows whose flag is NaN.
df['SEASONAL?'] = np.nan
# Fills df['SEASONAL?'] in place (no return value).
isSeasonal()
# Show the resulting flags.
df['SEASONAL?']

MULTILINGUAL DETECTION

In [None]:
# Placeholder column: filled with NaN only, no detection is computed in this cell.
# NOTE(review): the name is spelled 'MULTILINGAL?' (no 'U'); the column-order list in
# the reshape cell uses the same spelling, so rename both together if it is corrected.
df['MULTILINGAL?'] = np.nan

Reshape df

In [None]:
# Final column selection and order of the exported table.
new_cols = [
    'ID', 'INTERPRET', 'TITEL', 'processed_TITLE', 'TEXT', 'processed_TEXT',
    'DATUM_VON', 'DATUM_BIS', 'JAHR', 'MONAT', 'DAUER',
    'ANZ_UNIQUE_WOERTER', 'MAX_WORT_WDH', 'WORT_MAX_WDH', 'LAENGE_LAENGSTES_WORT',
    'STOPWORD_PERCENTAGE', 'LENGTH_TITLE', 'SEASONAL?', 'MULTILINGAL?',
    'POSITION', 'RANK_SCORE', 'MAX_RANK_SCORE', 'MEAN_RANK_SCORE',
]
# Selecting by label list already reorders the columns and raises KeyError if one is
# missing, so the extra reindex(columns=new_cols) was a redundant copy.
df = df[new_cols]
print(df.shape)

In [None]:
# Confirm the final column names and their order.
df.columns

Export

In [38]:
# Write the final feature table to CSV; no to_csv options are overridden.
# NOTE(review): this path starts with 'Data/' while RAW_DATA_PATH uses 'data/' - on a
# case-sensitive filesystem these are different directories; confirm which is intended.
df.to_csv('Data/processed/EDA.csv')
# Exports of the helper tables, currently disabled:
#df_Date.to_csv('Data/processed/DATE.csv')
#df_Titel.to_csv('Data/processed/TITLE-ANALYSIS.csv')