In [None]:
import nb_black

%load_ext lab_black

In [None]:
import os
import pickle
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import metrics
from sklearn import naive_bayes
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import shuffle
from sklearn.utils.extmath import density

# The working directory used to be forced to a local checkout with os.chdir(...);
# that call stays disabled so the notebook runs from wherever it is launched.

# Seed NumPy's global random generator.
np.random.seed(42)

# NOTE(review): presumably the k for SelectKBest; no visible cell reads it.
K_Best_Features = 3000

# Per-class weights: label 1 counts three times as much as label 0.
weights = {0: 1, 1: 3}
def KFold_validation(clf, X, y):
    """Score ``clf`` with shuffled 5-fold cross-validation on TF-IDF features.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is re-fitted on
        every fold.
    X : sequence
        Documents. Each one is handed to a ``TfidfVectorizer`` whose tokenizer
        and preprocessor are the identity, so the vectorizer iterates the
        document itself to obtain tokens.
    y : sequence
        Binary labels (0/1 or False/True) aligned with ``X``.

    Returns
    -------
    tuple
        Fold means of, in order: accuracy, precision/recall/F1 for label 1,
        precision/recall/F1 for label 0.
    """

    def as_is(doc):
        # Identity step: documents reach the vectorizer untouched.
        return doc

    # Materialise both inputs as plain lists so the fold indices are always
    # positional. ``X[i]`` on a pandas Series is a label lookup and fails
    # (or warns) when the index is not 0..n-1, e.g. after slicing.
    docs = list(X)
    labels = list(y)

    fold_scores = []
    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    for train_idx, test_idx in splitter.split(docs):
        train_docs = [docs[i] for i in train_idx]
        test_docs = [docs[i] for i in test_idx]
        train_labels = [labels[i] for i in train_idx]
        test_labels = [labels[i] for i in test_idx]

        # The vocabulary and IDF weights come from the training fold only.
        vectorizer = TfidfVectorizer(
            analyzer="word",
            tokenizer=as_is,
            preprocessor=as_is,
            token_pattern=None,
        )
        train_matrix = vectorizer.fit_transform(train_docs)
        test_matrix = vectorizer.transform(test_docs)

        clf.fit(train_matrix, train_labels)
        preds = clf.predict(test_matrix)

        fold_scores.append(
            (
                metrics.accuracy_score(test_labels, preds),
                metrics.precision_score(test_labels, preds, pos_label=1),
                metrics.recall_score(test_labels, preds, pos_label=1),
                metrics.f1_score(test_labels, preds, pos_label=1),
                metrics.precision_score(test_labels, preds, pos_label=0),
                metrics.recall_score(test_labels, preds, pos_label=0),
                metrics.f1_score(test_labels, preds, pos_label=0),
            )
        )

    # Column-wise mean over folds, returned as a 7-tuple like before.
    return tuple(np.mean(fold_scores, axis=0))


def benchmark_clfs(X, y):
    """Cross-validate a fixed set of classifiers and tabulate their scores.

    Parameters
    ----------
    X, y
        Documents and binary labels, forwarded unchanged to
        ``KFold_validation``.

    Returns
    -------
    pandas.DataFrame
        One column per classifier and one row per metric (accuracy, then
        precision/recall/F1 for label 1 and for label 0).
    """
    print("Loading dataset...")

    classifiers = [
        ("LinearSVC", svm.LinearSVC()),
        ("LogisticReg", LogisticRegression()),
        ("SGD", SGDClassifier()),
        ("MultinomialNB", naive_bayes.MultinomialNB()),
        ("KNN", KNeighborsClassifier()),
        ("DecisionTree", DecisionTreeClassifier()),
        ("RandomForest", RandomForestClassifier()),
        # The weak learner is passed positionally: the keyword was renamed
        # from ``base_estimator`` to ``estimator`` and the old spelling was
        # removed in scikit-learn 1.4, while the position works in both.
        ("AdaBoost", AdaBoostClassifier(LogisticRegression())),
    ]

    metric_names = [
        "accuracy",
        "pos_precision",
        "pos_recall",
        "pos_f1_score",
        "neg_precision",
        "neg_recall",
        "neg_f1_score",
    ]

    # Building the frame column by column keeps every value numeric, so no
    # transpose or ``pd.to_numeric(errors="ignore")`` pass (deprecated in
    # pandas) is needed afterwards.
    df = pd.DataFrame(
        {name: KFold_validation(clf, X, y) for name, clf in classifiers},
        index=metric_names,
    )
    df.columns.name = "metrics"

    return df


def dummy_fun(doc):
    """Return ``doc`` unchanged (identity tokenizer/preprocessor for TF-IDF)."""
    return doc


def eval_model(X, y, comments_path=None):
    """Train a class-weighted LinearSVC on TF-IDF features and label a comments CSV.

    Parameters
    ----------
    X : iterable
        Training documents. They go through a ``TfidfVectorizer`` whose
        tokenizer and preprocessor are the identity (``dummy_fun``), so each
        document is expected to already be an iterable of tokens.
    y : array-like
        Training labels; classes 0 and 1 are weighted 1 and 3.
    comments_path : str or path-like, optional
        CSV with ``created_time`` and ``title`` columns. When omitted, the
        module-level name ``comment_file`` is read, as before.

    Side effects
    ------------
    Writes the comments plus a ``polarity`` column to
    ``stock_comments_analyzed.csv`` in the working directory.
    """
    print("Loading dataset...")

    # Class weights are an estimator option; ``fit`` has no such keyword, so
    # the former ``fit(..., class_weights=...)`` call raised TypeError.
    clf = svm.LinearSVC(class_weight={0: 1, 1: 3})

    vectorizer = TfidfVectorizer(
        analyzer="word", tokenizer=dummy_fun, preprocessor=dummy_fun, token_pattern=None
    )
    train_matrix = vectorizer.fit_transform(X)

    print("Train model...")
    clf.fit(train_matrix, y)

    print("Loading comments...")
    if comments_path is None:
        # Fallback to the notebook-level variable the original code relied on.
        comments_path = comment_file
    df = pd.read_csv(comments_path).dropna().reset_index(drop=True)
    df["created_time"] = pd.to_datetime(df["created_time"], format="%Y-%m-%d %H:%M:%S")

    # Split each title on whitespace before vectorising. The original computed
    # this list and threw it away, so raw strings reached the vectorizer.
    # The ``title`` column itself is left untouched for the export.
    # NOTE(review): the training documents in X must be tokenised the same way.
    token_lists = df["title"].apply(lambda title: title.split())

    df["polarity"] = clf.predict(vectorizer.transform(token_lists))

    df.to_csv("stock_comments_analyzed.csv", index=False)

In [None]:
import tushare as ts

# Tushare Pro client; the API token must already be configured for this call.
pro = ts.pro_api()

# Number of rows the daily change is shifted by before it is joined to comments.
period = 1
# NOTE(review): `id` shadows the builtin; the name is kept in case other cells read it.
id = "000002"

# Shanghai codes start with 6 (A shares) or 9 (B shares); everything else
# handled here is treated as Shenzhen. The former numeric test
# (int(id) > 100000) labelled Shenzhen codes such as 300xxx and 200xxx as ".SH".
if id.startswith(("6", "9")):
    stock_code = id + ".SH"
else:
    stock_code = id + ".SZ"

quotes = pro.daily(ts_code=stock_code, start_date="20080101")
quotes.set_index("trade_date", inplace=True)
# trade_date arrives as "YYYYMMDD" text; later cells merge on datetime indexes,
# so convert here or those joins cannot match any row.
quotes.index = pd.to_datetime(quotes.index, format="%Y%m%d")
pct_chg = quotes["pct_chg"].shift(periods=period, fill_value=0)

In [None]:
 # Identity callable used below as both tokenizer and preprocessor of the
 # TfidfVectorizer. An identical function is already defined in the
 # classifier-benchmark cell; this definition simply rebinds the name.
 def dummy_fun(doc):
            return doc

# TF-IDF vectorizer that receives every document as-is: tokenising and
# preprocessing are both the identity function.
vectorizer = TfidfVectorizer(
    analyzer="word",
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
)

# Labelling threshold: mean daily change plus half a standard deviation.
thre = np.mean(merged["pct_chg"]) + 0.5 * np.std(merged["pct_chg"])

In [None]:
X = merged["title"]
y = merged["pct_chg"] > thre

# Rows are picked by position. ``.iloc`` makes that explicit; plain ``X[i]``
# with an integer is a deprecated positional fallback when the index holds
# dates rather than integers.
X_train = X.iloc[290000:310000].tolist()
y_train = y.iloc[290000:310000].astype(int).tolist()  # booleans -> 0/1
X_test = X.iloc[310000:324832].tolist()

# Fit the TF-IDF vocabulary on the training rows only, then encode both sets.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Keras tokenizer: strips the listed punctuation, lower-cases and splits on
# single spaces, then builds its vocabulary from every text currently in X.
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" "
)
tokenizer.fit_on_texts(X)

In [None]:
word_embedding = True
if word_embedding:
    print("Embedding...")
    EMBEDDING_FILE = "D:/sgns.financial.bigram-char"
    embed_size = 300

    def get_coefs(word, *arr):
        """Return ``(word, vector)`` with the vector as a float32 array."""
        return word, np.asarray(arr, dtype="float32")

    # Decode as UTF-8: with ISO-8859-1 every non-ASCII word is read as
    # mojibake and can never equal a key of tokenizer.word_index, leaving the
    # matrix at zero. The ``with`` block also closes the file, which the bare
    # open() call never did.
    # NOTE(review): assumes the vectors file is UTF-8 text — confirm.
    with open(EMBEDDING_FILE, encoding="utf-8", errors="ignore") as embedding_file:
        embeddings_index = dict(
            get_coefs(*line.rstrip().rsplit(" ")) for line in embedding_file
        )

    word_index = tokenizer.word_index
    # Row 0 stays empty because Keras word ids start at 1. Sized from
    # word_index because ``vocab`` is only assigned in a later cell.
    embedding_matrix = np.zeros((len(word_index) + 1, embed_size))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        # Skip unknown words and rows of the wrong length (e.g. a header line).
        if embedding_vector is not None and embedding_vector.shape == (embed_size,):
            embedding_matrix[i] = embedding_vector

In [None]:
tokenizer = Tokenizer(num_words=10000)
# A new Tokenizer has an empty vocabulary, so without fitting every text maps
# to an empty sequence. Fit on the training slice only.
tokenizer.fit_on_texts(X[315000:320000])
max_len = 20
X_train = tokenizer.texts_to_sequences(X[315000:320000])
X_test = tokenizer.texts_to_sequences(X[320000:])
X_train_padded_seqs = pad_sequences(X_train, maxlen=max_len)
X_test_padded_seqs = pad_sequences(X_test, maxlen=max_len)
# Binary bag-of-words built from the padded sequences.
x_train = tokenizer.sequences_to_matrix(X_train_padded_seqs, mode="binary")
x_test = tokenizer.sequences_to_matrix(X_test_padded_seqs, mode="binary")


In [None]:
# sequences_to_matrix expects lists of word ids, not raw text, so the title is
# converted with texts_to_sequences first.
tokenizer.sequences_to_matrix(tokenizer.texts_to_sequences(X[315000:315001]))

In [None]:
# Debug probe: one cell (column 357815) of the matrix row built from the
# second entry of X_train.
# NOTE(review): 357815 is a hard-coded column; it is out of range whenever the
# tokenizer in scope produces fewer columns (e.g. num_words=10000) — confirm
# which tokenizer this was written against.
tokenizer.sequences_to_matrix(X_train[0:2])[1][357815]

In [None]:
# Preview the first five training entries.
X_train[0:5]

In [None]:
     

        # Fit a default k-nearest-neighbours classifier on whatever X_train /
        # y_train currently hold and predict labels for X_test.
        # NOTE(review): X_train is reassigned by several cells above; this
        # presumably expects the TF-IDF matrices — confirm the run order.
        clf = KNeighborsClassifier()
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)

In [None]:
# texts_to_sequences takes a list of texts. A bare string would be iterated
# character by character, so the single title is wrapped in a list and picked
# by position with .iloc.
tokenizer.texts_to_sequences([X.iloc[315000]])

In [None]:
# Show the title at position 315000. ``.iloc`` is used because an integer key
# on a Series without an integer index is a deprecated positional lookup.
X.iloc[315000]

In [None]:
# Mean of the predictions, i.e. the share predicted as label 1 when preds is 0/1.
np.mean(preds)

In [None]:
# Rows 310000..324831 of `merged` by position — the same positions used for
# X_test in the TF-IDF split cell.
m=merged.iloc[range(310000,324832)]

In [None]:
# Group column 'p' by comment date.
# NOTE(review): no visible cell creates m['p']; presumably it holds the
# classifier predictions (`preds`) for these rows — confirm and add the
# assignment so the notebook runs from a fresh kernel.
mg = m['p'].groupby(m['created_time'])

In [None]:
def BI_Simple_func(row):
    """Return the bullishness index ``(pos - neg) / (pos + neg)`` of ``row``.

    Parameters
    ----------
    row : pandas.Series
        Labels of one group; entries equal to 1 count as positive and entries
        equal to 0 as negative. Any other value is ignored.

    Returns
    -------
    float
        A value in [-1, 1], or NaN when the group has no 0/1 entries (the
        original expression divided by zero in that case).
    """
    pos = int((row == 1).sum())
    neg = int((row == 0).sum())
    total = pos + neg
    if total == 0:
        return float("nan")
    return (pos - neg) / total

# One bullishness value per group (per `created_time` date).
BI_Simple_index = mg.apply(BI_Simple_func)


In [None]:
# Inspect entries 50..199 of the daily index.
(BI_Simple_index)[50:200]

In [None]:
# Boolean daily signal: True on dates whose index exceeds -0.8.
# NOTE(review): -0.8 is a hard-coded cut-off with no visible derivation.
d=BI_Simple_index>-0.8
# Convert the date labels to datetimes so the signal can be joined on index.
d.index = pd.to_datetime(d.index)

In [None]:
#np.mean(BI_Simple_index>-0.7)
# Left-join the shifted daily change onto the signal by date index; every
# signal date is kept, with NaN where no quote row matches.
M  = pd.merge(d, pct_chg, how="left", left_index=True, right_index=True)

In [None]:
# Daily strategy return: the percentage change on dates where the signal 'p'
# is True, zero otherwise (boolean times number).
M['profit'] = M['p']*M['pct_chg']


In [None]:
# NOTE(review): 217 is hard-coded; per the trailing note it is meant to drop
# the last two rows, which only holds if M has 219 rows — confirm.
profit = M['profit'].iloc[0:217]# drop the last 2 rows

In [None]:
# Simple (non-compounded) sum of the daily strategy returns, in percent.
sum(profit)

In [None]:
# Lowest value of the compounded equity curve (start = 1).
# np.cumproduct was removed in NumPy 2.0; np.cumprod is the supported name.
min(np.cumprod(1 + profit / 100))

In [None]:
# Standard deviation of the daily strategy returns.
np.std(profit)

In [None]:
# Total return divided by the daily standard deviation.
# NOTE(review): looks like an informal return-to-risk ratio; it is not
# annualised and uses a sum rather than a mean — confirm the intent.
sum(profit)/np.std(profit)

In [None]:
# Compounded equity curve of the strategy.
# np.cumproduct was removed in NumPy 2.0; np.cumprod is the supported name.
plt.plot(np.cumprod(1 + profit / 100))
# Raw daily percentage change over the same rows, drawn on the same axes.
plt.plot(M['pct_chg'].iloc[0:217])

In [None]:
# Share of days on which the strategy return is exactly zero.
np.mean(profit==0)

In [None]:
# Load the scraped forum comments, dropping incomplete rows and renumbering.
df = pd.read_excel("D:/股吧评论/gu000002.xlsx").dropna().reset_index(drop=True)

# Reduce each timestamp to its calendar day, kept as "YYYY-MM-DD" text;
# unparseable dates become missing values.
parsed_dates = pd.to_datetime(df["Date"], format="%Y-%m-%d %H:%M", errors="coerce")
df["created_time"] = parsed_dates.dt.strftime("%Y-%m-%d")

# Discard rows whose title is the number 0, then index the comments by day.
df = df[df["title"] != 0]
df.index = pd.to_datetime(df["created_time"])

# Attach the shifted daily change to every comment of the same day; the right
# join keeps all quote dates and dropna() removes rows left incomplete.
merged = pd.merge(
    df, pct_chg, how="right", left_index=True, right_index=True
).dropna()

In [None]:


# Labelling threshold: mean daily change plus half a standard deviation.
thre = np.mean(merged["pct_chg"]) + 0.5 * np.std(merged["pct_chg"])

# Benchmark on the rows from position 290000 onward only.
X = merged["title"].iloc[290000:]
y = (merged["pct_chg"] > thre).iloc[290000:]

scores = benchmark_clfs(X, y)
print(scores)
scores.to_csv("model_ml_scores1.csv", float_format="%.4f")


# eval_model()

In [None]:
# Display X as currently bound (prints the whole object; X.head() is lighter).
X

In [None]:
import os
import gc
import pickle
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Activation, Input
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D
from keras.layers import LSTM, GRU, TimeDistributed, Bidirectional
from keras.layers import BatchNormalization
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

In [None]:
# Labelling threshold: mean daily change plus half a standard deviation.
thre = np.mean(merged["pct_chg"]) + 0.5 * np.std(merged["pct_chg"])

# Class weights handed to model.fit: label 1 counts three times as much.
weights = {0: 1, 1: 3}
X = merged["title"]
y = merged["pct_chg"] > thre
# Only the rows from position 310000 onward are used for the neural models.
X = X[310000:]
y = y[310000:]
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" "
)
tokenizer.fit_on_texts(X)
vocab = tokenizer.word_index
print("Vocab size", len(vocab))

# Persist the fitted tokenizer next to the notebook.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

max_len = 64

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_train_word_ids = tokenizer.texts_to_sequences(X_train)
X_test_word_ids = tokenizer.texts_to_sequences(X_test)
X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=max_len)
X_test_padded_seqs = pad_sequences(X_test_word_ids, maxlen=max_len)

word_embedding = True
if word_embedding:
    print("Embedding...")
    EMBEDDING_FILE = "D:/sgns.financial.bigram-char"
    embed_size = 300

    def get_coefs(word, *arr):
        """Return ``(word, vector)`` with the vector as a float32 array."""
        return word, np.asarray(arr, dtype="float32")

    # Decode as UTF-8: with ISO-8859-1 every non-ASCII word is read as
    # mojibake and can never equal a key of tokenizer.word_index, leaving the
    # matrix at zero. The ``with`` block also closes the file, which the bare
    # open() call never did.
    # NOTE(review): assumes the vectors file is UTF-8 text — confirm.
    with open(EMBEDDING_FILE, encoding="utf-8", errors="ignore") as embedding_file:
        embeddings_index = dict(
            get_coefs(*line.rstrip().rsplit(" ")) for line in embedding_file
        )

    word_index = tokenizer.word_index
    # Row 0 stays empty because Keras word ids start at 1.
    embedding_matrix = np.zeros((len(vocab) + 1, embed_size))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        # Skip unknown words and rows of the wrong length (e.g. a header line).
        if embedding_vector is not None and embedding_vector.shape == (embed_size,):
            embedding_matrix[i] = embedding_vector


class Metrics(Callback):
    """Keras callback recording validation F1, recall and precision per epoch.

    Parameters
    ----------
    validation_data : tuple, optional
        ``(inputs, targets)`` to score after each epoch. Recent Keras releases
        do not set ``validation_data`` on callbacks, so it can be supplied
        here (or assigned to the attribute before ``fit``). When omitted, the
        attribute is expected to be set by the framework, as in older releases.

    Attributes
    ----------
    val_f1s, val_recalls, val_precisions : list of float
        One entry per completed epoch; reset when training starts.
    """

    def __init__(self, validation_data=None):
        super().__init__()
        if validation_data is not None:
            self.validation_data = validation_data

    def on_train_begin(self, logs=None):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        val_data = getattr(self, "validation_data", None)
        if val_data is None:
            raise ValueError(
                "Metrics has no validation data: pass Metrics((x_val, y_val)) "
                "or set .validation_data before calling fit()."
            )
        # Sigmoid outputs are rounded to hard 0/1 predictions.
        val_predict = np.asarray(self.model.predict(val_data[0])).round()
        val_targ = val_data[1]
        self.val_f1s.append(f1_score(val_targ, val_predict))
        self.val_recalls.append(recall_score(val_targ, val_predict))
        self.val_precisions.append(precision_score(val_targ, val_predict))


def train_model_MLP():
    """Train a one-hidden-layer MLP on binary bag-of-words features.

    Reads the notebook-level ``tokenizer``, ``vocab``, ``X_train_word_ids``,
    ``X_test_word_ids``, ``y_train``, ``y_test`` and ``weights``. The best
    checkpoint (by the default monitored quantity) is saved to
    ``./model-MLP.h5``.

    Returns
    -------
    tuple
        ``(best_val_accuracy, precision, recall, f1)``, the last three taken
        at the epoch with the highest validation accuracy.
    """
    x_train = tokenizer.sequences_to_matrix(X_train_word_ids, mode="binary")
    x_test = tokenizer.sequences_to_matrix(X_test_word_ids, mode="binary")

    model = Sequential()
    model.add(Dense(128, input_shape=(len(vocab) + 1,), activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    model.summary()

    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    model_checkpoint = ModelCheckpoint("./model-MLP.h5", save_best_only=True)
    val_metrics = Metrics()
    # Recent Keras releases no longer populate `validation_data` on callbacks,
    # so hand the validation set to the callback explicitly.
    val_metrics.validation_data = (x_test, y_test)
    hist = model.fit(
        x_train,
        y_train,
        batch_size=128,
        epochs=40,
        validation_data=(x_test, y_test),
        callbacks=[val_metrics, early_stopping, model_checkpoint],
        class_weight=weights,
    )

    val_accuracy = hist.history["val_accuracy"]
    idx = int(np.argmax(val_accuracy))
    best_acc = val_accuracy[idx]
    # Previously stored as `posi_precision`, leaving `precision` undefined at
    # the return statement.
    precision = val_metrics.val_precisions[idx]
    recall = val_metrics.val_recalls[idx]
    f1score = val_metrics.val_f1s[idx]

    # Release the model and callbacks before the next model is built.
    del model, early_stopping, model_checkpoint, val_metrics
    gc.collect()

    return (best_acc, precision, recall, f1score)


def train_model_LSTM():
    """Train a single-layer LSTM on padded word-id sequences.

    The embedding layer starts from the notebook-level ``embedding_matrix``
    and stays trainable. Also reads ``vocab``, ``max_len``,
    ``X_train_padded_seqs``, ``X_test_padded_seqs``, ``y_train``, ``y_test``
    and ``weights``. The best checkpoint is saved to ``model-LSTM.h5``.

    Returns
    -------
    tuple
        ``(best_val_accuracy, precision, recall, f1)``, the last three taken
        at the epoch with the highest validation accuracy.
    """
    embed_size = 300

    model = Sequential()
    model.add(
        Embedding(
            len(vocab) + 1,
            embed_size,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=True,
        )
    )
    model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.2))
    model.add(Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    model.summary()

    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    model_checkpoint = ModelCheckpoint("model-LSTM.h5", save_best_only=True)
    val_metrics = Metrics()
    # Recent Keras releases no longer populate `validation_data` on callbacks,
    # so hand the validation set to the callback explicitly.
    val_metrics.validation_data = (X_test_padded_seqs, y_test)
    hist = model.fit(
        X_train_padded_seqs,
        y_train,
        batch_size=128,
        epochs=100,
        validation_data=(X_test_padded_seqs, y_test),
        callbacks=[val_metrics, early_stopping, model_checkpoint],
        class_weight=weights,
    )

    val_accuracy = hist.history["val_accuracy"]
    idx = int(np.argmax(val_accuracy))
    best_acc = val_accuracy[idx]
    precision = val_metrics.val_precisions[idx]
    recall = val_metrics.val_recalls[idx]
    f1score = val_metrics.val_f1s[idx]

    # Release the model and callbacks before the next model is built.
    del model, early_stopping, model_checkpoint, val_metrics
    gc.collect()

    return (best_acc, precision, recall, f1score)


def train_model_TextCNN():
    """Train a three-branch TextCNN on padded word-id sequences.

    Three parallel convolutions (kernel sizes 3, 4 and 5) over a freshly
    initialised embedding are max-pooled, concatenated and fed to a sigmoid
    unit. Reads the notebook-level ``vocab``, ``max_len``,
    ``X_train_padded_seqs``, ``X_test_padded_seqs``, ``y_train``, ``y_test``
    and ``weights``. The best checkpoint is saved to ``model-TextCNN.h5``.

    Returns
    -------
    tuple
        ``(best_val_accuracy, precision, recall, f1)``, the last three taken
        at the epoch with the highest validation accuracy.
    """
    embed_size = 300

    # Word ids are integers, so the input is declared as int32 rather than
    # the float64 it used to be.
    main_input = Input(shape=(max_len,), dtype="int32")
    embed = Embedding(len(vocab) + 1, embed_size, input_length=max_len)(main_input)

    # One convolution + pooling branch per kernel size.
    branches = []
    for kernel_size in (3, 4, 5):
        conv = Convolution1D(
            256, kernel_size, padding="same", strides=1, activation="relu"
        )(embed)
        branches.append(MaxPool1D(pool_size=4)(conv))

    cnn = concatenate(branches, axis=-1)
    flat = Flatten()(cnn)
    drop = Dropout(0.2)(flat)
    main_output = Dense(1, activation="sigmoid")(drop)
    model = Model(inputs=main_input, outputs=main_output)

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    model.summary()

    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    model_checkpoint = ModelCheckpoint("model-TextCNN.h5", save_best_only=True)
    val_metrics = Metrics()
    # Recent Keras releases no longer populate `validation_data` on callbacks,
    # so hand the validation set to the callback explicitly.
    val_metrics.validation_data = (X_test_padded_seqs, y_test)

    hist = model.fit(
        X_train_padded_seqs,
        y_train,
        batch_size=128,
        epochs=20,
        validation_data=(X_test_padded_seqs, y_test),
        callbacks=[early_stopping, val_metrics, model_checkpoint],
        class_weight=weights,
    )

    val_accuracy = hist.history["val_accuracy"]
    idx = int(np.argmax(val_accuracy))
    best_acc = val_accuracy[idx]
    precision = val_metrics.val_precisions[idx]
    recall = val_metrics.val_recalls[idx]
    f1score = val_metrics.val_f1s[idx]

    # Release the model and callbacks before the next model is built.
    del model, early_stopping, model_checkpoint, val_metrics
    gc.collect()

    return (best_acc, precision, recall, f1score)


def eval_models():
    """Train the MLP, TextCNN and LSTM models and tabulate their scores.

    Returns
    -------
    pandas.DataFrame
        One column per model (``NN(MLP)``, ``CNN(TextCNN)``, ``RNN(LSTM)``)
        and one row per metric (accuracy, precision, recall, f1score).
    """
    trainers = [
        ("NN(MLP)", train_model_MLP),
        ("CNN(TextCNN)", train_model_TextCNN),
        ("RNN(LSTM)", train_model_LSTM),
    ]

    # Models are trained one after another, in the order listed above.
    results = {}
    for label, train in trainers:
        results[label] = train()

    # Building the frame column by column keeps every value numeric, so the
    # transpose and the deprecated pd.to_numeric(errors="ignore") pass are
    # not needed.
    df = pd.DataFrame(results, index=["accuracy", "precision", "recall", "f1score"])
    df.columns.name = "model"

    return df


# Train all three models and save the score table. The name `df` is rebound
# here, replacing whatever frame an earlier cell stored under it.
if __name__ == "__main__":
    df = eval_models()

    df.to_csv("model_dl_scoresq.csv", float_format="%.4f")

In [None]:
# Labelling threshold: mean daily change plus half a standard deviation.
thre = np.mean(merged["pct_chg"]) + 0.5 * np.std(merged["pct_chg"])

# Class weights handed to model.fit: label 1 counts three times as much.
weights = {0: 1, 1: 3}
X = merged["title"]
y = merged["pct_chg"] > thre
# Only the rows from position 310000 onward are used for the neural models.
X = X[310000:]
y = y[310000:]
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=" "
)
tokenizer.fit_on_texts(X)
vocab = tokenizer.word_index
print("Vocab size", len(vocab))

# Persist the fitted tokenizer next to the notebook.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

max_len = 64

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)
X_train_word_ids = tokenizer.texts_to_sequences(X_train)
X_test_word_ids = tokenizer.texts_to_sequences(X_test)
X_train_padded_seqs = pad_sequences(X_train_word_ids, maxlen=max_len)
X_test_padded_seqs = pad_sequences(X_test_word_ids, maxlen=max_len)

word_embedding = True
if word_embedding:
    print("Embedding...")
    EMBEDDING_FILE = "D:/sgns.financial.bigram-char"
    embed_size = 300

    def get_coefs(word, *arr):
        """Return ``(word, vector)`` with the vector as a float32 array."""
        return word, np.asarray(arr, dtype="float32")

    # Decode as UTF-8: with ISO-8859-1 every non-ASCII word is read as
    # mojibake and can never equal a key of tokenizer.word_index, leaving the
    # matrix at zero. The ``with`` block also closes the file, which the bare
    # open() call never did.
    # NOTE(review): assumes the vectors file is UTF-8 text — confirm.
    with open(EMBEDDING_FILE, encoding="utf-8", errors="ignore") as embedding_file:
        embeddings_index = dict(
            get_coefs(*line.rstrip().rsplit(" ")) for line in embedding_file
        )

    word_index = tokenizer.word_index
    # Row 0 stays empty because Keras word ids start at 1.
    embedding_matrix = np.zeros((len(vocab) + 1, embed_size))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        # Skip unknown words and rows of the wrong length (e.g. a header line).
        if embedding_vector is not None and embedding_vector.shape == (embed_size,):
            embedding_matrix[i] = embedding_vector


class Metrics(Callback):
    """Keras callback recording validation F1, recall and precision per epoch.

    Parameters
    ----------
    validation_data : tuple, optional
        ``(inputs, targets)`` to score after each epoch. Recent Keras releases
        do not set ``validation_data`` on callbacks, so it can be supplied
        here (or assigned to the attribute before ``fit``). When omitted, the
        attribute is expected to be set by the framework, as in older releases.

    Attributes
    ----------
    val_f1s, val_recalls, val_precisions : list of float
        One entry per completed epoch; reset when training starts.
    """

    def __init__(self, validation_data=None):
        super().__init__()
        if validation_data is not None:
            self.validation_data = validation_data

    def on_train_begin(self, logs=None):
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        val_data = getattr(self, "validation_data", None)
        if val_data is None:
            raise ValueError(
                "Metrics has no validation data: pass Metrics((x_val, y_val)) "
                "or set .validation_data before calling fit()."
            )
        # Sigmoid outputs are rounded to hard 0/1 predictions.
        val_predict = np.asarray(self.model.predict(val_data[0])).round()
        val_targ = val_data[1]
        self.val_f1s.append(f1_score(val_targ, val_predict))
        self.val_recalls.append(recall_score(val_targ, val_predict))
        self.val_precisions.append(precision_score(val_targ, val_predict))

In [None]:
   
    # Inline copy of the MLP training steps that keeps `model`, `hist` and
    # the callback available to the following cells.
    x_train = tokenizer.sequences_to_matrix(X_train_word_ids, mode="binary")
    x_test = tokenizer.sequences_to_matrix(X_test_word_ids, mode="binary")

    model = Sequential()
    model.add(Dense(128, input_shape=(len(vocab) + 1,), activation="relu"))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation="sigmoid"))

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    model.summary()

    early_stopping = EarlyStopping(monitor="val_loss", patience=5)
    model_checkpoint = ModelCheckpoint("./model-MLP.h5", save_best_only=True)
    # NOTE(review): at notebook level this rebinds `metrics`, hiding the
    # sklearn `metrics` module imported in the first code cell. The name is
    # kept because a later cell reads `metrics.val_precisions`.
    metrics = Metrics()
    # Recent Keras releases no longer populate `validation_data` on callbacks,
    # so hand the validation set to the callback explicitly.
    metrics.validation_data = (x_test, y_test)
    hist = model.fit(
        x_train,
        y_train,
        batch_size=128,
        epochs=40,
        validation_data=(x_test, y_test),
        callbacks=[metrics, early_stopping, model_checkpoint],
        class_weight=weights,
    )


In [None]:
# Display whatever frame is currently bound to `df`.
df

In [None]:

    # Pick the epoch with the highest validation accuracy and print the
    # callback's precision, recall and F1 recorded at that epoch.
    best_acc = max(hist.history["val_accuracy"])
    idx = np.argmax(hist.history["val_accuracy"])
    precision = metrics.val_precisions[idx]
    recall = metrics.val_recalls[idx]
    f1score = metrics.val_f1s[idx]
    print([precision,recall,f1score])

In [None]:
# `shape` is an attribute (a tuple); calling it raised TypeError.
model.predict(x_train).shape


In [None]:
# Peek at validation labels 1..19 (the first entry is skipped).
y_test[1:20]


In [None]:
# Peek at predictions 1..19.
# NOTE(review): `preds` is only assigned in the KNN cell, so it presumably
# does not line up with the `y_test` shown above — confirm.
preds[1:20]

In [None]:
# Validation accuracy recorded after each epoch of the last fit.
hist.history['val_accuracy']

In [None]:
import matplotlib.pyplot as plt
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
for history_key in ('accuracy', 'val_accuracy'):
    ax.plot(hist.history[history_key])
ax.set_title('model accuracy')
ax.set_ylabel('accuracy')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for history_key in ('loss', 'val_loss'):
    ax.plot(hist.history[history_key])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Boolean label column: True where the daily change exceeds the threshold.
# This adds a column to `merged` in place.
merged['y']=merged['pct_chg']>thre

In [None]:
# Two-column view (text, label) used for the exports below.
ndf=merged[['title','y']]

In [None]:
# Print the frame summary (row count, dtypes, non-null counts).
ndf.info()

In [None]:
# Training split: the first 200000 rows as tab-separated text without the
# index; header=0 is falsy, so no header row is written.
ndf.iloc[0:200000].to_csv('train.txt',sep='\t',index=False,header=0)


In [None]:
# Test split: rows from position 220000 onward; validation split: rows
# 200000..219999. Same tab-separated, header-less format as train.txt.
ndf.iloc[220000:].to_csv('test.txt',sep='\t',index=False,header=0)
ndf.iloc[200000:220000].to_csv('val.txt',sep='\t',index=False,header=0)

In [None]:
# Smaller training file: rows 150000..199999, a subset of train.txt.
ndf.iloc[150000:200000].to_csv('train2.txt',sep='\t',index=False,header=0)

In [None]:
# Smaller test file: rows from position 310000 onward, a subset of test.txt.
ndf.iloc[310000:].to_csv('test2.txt',sep='\t',index=False,header=0)