In [None]:
import pandas as pd
import glob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.preprocessing import sequence

In [None]:
#1 Load data

In [None]:
path = r'/Users/andersfleck/Desktop/data-science/political-analysis/model/training_data'
all_files = glob.glob(path + "/*.csv")

In [None]:
#2 Store data in Dataframe

In [None]:
li = []

for filename in all_files:
    data = pd.read_csv(filename, index_col=None, header=0)
    li.append(data)
    
df = pd.concat(li, axis=0, ignore_index=True, sort=True)

In [None]:
#3 Clean data (drop eu_code-column; drop 'NaN'; remove '.' & ','; make all lower case)

In [7]:
df = df.drop(['eu_code'],axis=1)
df = df.dropna()
df.content = df['content'].str.replace('[^\w\s]','')
df.content = df['content'].str.lower()

In [8]:
left = [101,103,105,106,107,108,201,202,301,403,404,411,412,414,415,501,502,503,504,602,604,607,701]
right = [102,104,109,110,302,305,401,405,413,505,601,603,605,608,702,703]
to_be_deleted = [000,203,204,303,304,402,406,407,408,409,410,416,506,507,606,704,705,706]

In [9]:
df.cmp_code = df['cmp_code'].replace(to_be_deleted,999)
df = df[df.cmp_code != 999]

In [10]:
df.cmp_code = df['cmp_code'].replace(left,0)
df.cmp_code = df['cmp_code'].replace(right,1)

In [12]:
df

Unnamed: 0,cmp_code,content
3,1.0,wir machen uns für die freiheit des einzelneni...
6,0.0,in der liberalen bürgergesellschaft entscheide...
8,0.0,denn durch sie wird politik gestaltet
9,1.0,sie schöpft ihre kraft aus der eigeninitiative...
11,1.0,wir liberale vertrauen auf die leistungsbereit...
12,1.0,und auf einen staat der seine stärke aus der b...
13,1.0,wir wenden uns an alle mündigen und verantwort...
14,1.0,sie erkennen selbst was getan werden muss und ...
15,1.0,die fdp ist die partei für alle die ihr leben ...
16,0.0,wir wollen die maßstäbe politischen handelns n...


In [None]:
#4 Remove stopwords

In [None]:
fileName = '/Users/andersfleck/Desktop/data-science/political-analysis/model/training_data/stopwords.txt'
stopword_file = open(fileName, 'r')
stopword_data = [line.split(',') for line in stopword_file.readlines()]
stopwords = []

In [None]:
for word_list in stopword_data:
    for word in word_list:
        word.strip()
        stopwords.append(word.strip())

In [None]:
df.content = df['content'].replace(stopwords,'')

In [None]:
#5 Split dataset into training and test set

In [19]:
X = df[['content']]
ylabels = df['cmp_code']
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.25)

In [20]:
X_train.shape

(14598, 1)

In [21]:
#6 Vectorize cleaned text with sklearn (BoW)

In [23]:
c = CountVectorizer()
vec_text = c.fit_transform(X_train['content'].tolist())
#df_vec = pd.DataFrame(vec_text.todense(), columns = list(sorted(c.vocabulary_)))

In [28]:
vec_text.todense().shape

(14598, 22687)

In [None]:
#7 Create pipelines

classifier = LogisticRegression(C=0.5)
model = Pipeline([('vectorizer', c),
                 ('classifier', classifier)
])

In [29]:
classifier = LogisticRegression(C=0.5)

classifier.fit(vec_text,y_train)



LogisticRegression(C=0.5, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
classifier.score(vec_text,y_train)

0.9210165776133717

In [32]:
x_test_vec = c.transform(X_test['content'].tolist())

In [33]:
classifier.score(x_test_vec, y_test)

0.8684751335799424

In [None]:
#Balance the dataset input (left, right)
#Use LSTM to classify
    #Input CountVectorizer()
    #Neural network with embedding layer followed by LSTM layer

In [40]:
new_text = c.transform(['lackadaisical'])

classifier.predict_proba(new_text)

array([[0.87393951, 0.12606049]])

### TEST MODEL WITH KERAS

In [None]:
model = Sequential()
model.add(Embedding(INPUTSIZE, 16, input_length=20))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X_train,y_train, epochs=50,verbose=0)

In [None]:
model.summary()