In [1]:
import os
import pandas as pd
import datetime
import json
import re
import tushare as ts
import gensim
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [33]:
import matplotlib as mpl
# Set the default sans-serif font to Microsoft YaHei, and stop the minus sign
# '-' from being drawn as a box when figures are saved.
mpl.rcParams.update({
    'font.sans-serif': ['Microsoft YaHei'],
    'axes.unicode_minus': False,
})
# Calling sns.axes_style() shows whether the font was successfully set to Microsoft YaHei.

In [2]:
# Plot appearance for every figure drawn after this cell:
# seaborn's "whitegrid" style and its "talk" context.
sns.set_style("whitegrid")
sns.set_context("talk")

In [90]:
# Input/output locations used by the rest of the notebook.
# The defaults are the original machine-specific paths; set the SINA_DATA_DIR /
# ASSET_CLASS_DIR environment variables to run the notebook elsewhere without
# editing this cell.
data_dir = os.environ.get("SINA_DATA_DIR", "C:/Users/jgtzsx01/Documents/workspace/data/sina")
csv_dir = "%s/day-news" % data_dir
key_words_dir = "%s/week-keywords" % data_dir
# JSON file that is parsed into word_cnt and indexed as word_cnt[word][day].
word_count_file = "%s/word_count.json" % data_dir
# Folder that receives the exported <save_name>.csv and stock.csv files.
asset_class_dir = os.environ.get("ASSET_CLASS_DIR", "C:/Users/jgtzsx01/Documents/workspace/data/asset-class")

In [3]:
# Load a previously saved gensim Word2Vec model from a fixed local path.
# NOTE(review): the path is machine-specific; the cell fails on any other
# machine until the path is changed.
model = gensim.models.Word2Vec.load("C:/Users/jgtzsx01/Documents/workspace/model/model")

In [4]:
# Number of entries in the loaded model's vocabulary.
len(model.vocab)

11867

In [5]:
# Decide whether a word consists solely of Chinese characters.
def isChinese(word):
    """Return True when every character of ``word`` is a Chinese ideograph.

    A character counts as Chinese when its code point lies in the range
    U+4E00..U+9FA5 (inclusive).

    Args:
        word: the text to test. Byte strings are decoded as UTF-8 first.

    Returns:
        bool: True if all characters are in the range (vacuously True for an
        empty string); False if any character lies outside it or if a byte
        string is not valid UTF-8.
    """
    if isinstance(word, bytes):
        try:
            word = word.decode("utf-8")
        except UnicodeDecodeError:
            return False
    # Compare code points directly. The earlier regex '[ \u4e00 -\u9fa5]+' had
    # stray spaces inside the character class, which turned it into a range
    # starting at ' ' that also matched ASCII, and the True/False results were
    # swapped relative to the function's name.
    return all(u'\u4e00' <= ch <= u'\u9fa5' for ch in word)

In [36]:
# Parse the word-count JSON file into word_cnt; later cells index it as
# word_cnt[word][day] -> count.
with open(word_count_file, 'r') as f:
    raw_json = f.read()
    word_cnt = json.loads(raw_json)

In [47]:
# Per-day total over all words: for every day, the sum of (count / 10) across
# the whole word_cnt mapping. Used later as the normalising denominator.
# NOTE(review): the divisor 10 is carried over unchanged; its rationale is not
# visible in this notebook.
total_word_count = {}
for word in word_cnt:
    # items() and dict.get() behave the same on Python 2 and 3, unlike the
    # Python-2-only iteritems() / has_key().
    for day, value in word_cnt[word].items():
        total_word_count[day] = total_word_count.get(day, 0) + value * 1.0 / 10

In [192]:
# Keyword analysed by the following cells, and the base name of the CSV file
# that the keyword's series is saved under.
key_word = u"债券"
save_name = "bond"
# Show the 15 nearest neighbours of the keyword in the model, keeping only the
# tokens that isChinese() accepts.
for w, v in model.most_similar(key_word, topn=15):
    if isChinese(w):
        # One pre-formatted argument prints "<word> <similarity>" on both
        # Python 2 and Python 3 (the bare print statement is Python 2 only).
        print("%s %s" % (w, v))

债务 0.999874591827
地方 0.999865651131
项目 0.999859809875
政府 0.999857366085
公司 0.999856472015
工作 0.999856472015
问题 0.999852180481
企业 0.999851882458
集团 0.999850571156
行业 0.999848902225
市场 0.999848604202
记者 0.999846041203
今年 0.999845504761
影响 0.999844908714
规模 0.999842941761


In [193]:
# Daily counts of the keyword itself, one row per day.
# keys()/values() are materialised with list() so the constructor receives
# plain lists on Python 3 as well (there they are dict views, not lists).
word_df = pd.DataFrame({
    "date": list(word_cnt[key_word].keys()),
    "count": list(word_cnt[key_word].values()),
})

In [194]:
# Index the rows by the parsed "YYYY-MM-DD" date (the text column is kept) and
# put them in chronological order.
word_df = word_df.set_index(
    word_df["date"].map(lambda day_text: datetime.datetime.strptime(day_text, "%Y-%m-%d"))
).sort_index()

In [195]:
# Display the keyword's daily counts (the whole frame is rendered).
word_df

Unnamed: 0_level_0,count,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-12,10,2017-01-12
2017-01-13,85,2017-01-13
2017-01-14,8,2017-01-14
2017-01-15,20,2017-01-15
2017-01-16,56,2017-01-16
2017-01-17,36,2017-01-17
2017-01-18,95,2017-01-18
2017-01-19,58,2017-01-19
2017-01-20,84,2017-01-20
2017-01-21,60,2017-01-21


In [196]:
# Daily "heat" of the keyword: for each day, the similarity-weighted sum of the
# counts of every qualifying word, divided by that day's total_word_count.
hot_val = {}
# Minimum similarity to key_word that a word needs in order to contribute.
# NOTE(review): if model.similarity is a cosine similarity (presumably bounded
# by 1), a threshold of 1 admits essentially only key_word itself - confirm
# this is intended.
threshold = 1
for word in word_cnt:
    # Only words accepted by isChinese() that the model has a vector for.
    if isChinese(word) and word in model.vocab:
        dis = model.similarity(word, key_word)
        if dis >= threshold:
            # items() / dict.get() replace the Python-2-only iteritems() / has_key().
            for day, value in word_cnt[word].items():
                hot_val[day] = hot_val.get(day, 0) + value * dis
# Normalise by the day's total volume. Iterate over a snapshot so the dict is
# not being rewritten while its own view is being walked.
for key, value in list(hot_val.items()):
    hot_val[key] = value / total_word_count[key]

In [197]:
# Tabulate the per-day heat values, one row per day.
# keys()/values() are materialised with list() so the constructor receives
# plain lists on Python 3 as well (there they are dict views, not lists).
hot_df = pd.DataFrame({
    "date": list(hot_val.keys()),
    "value": list(hot_val.values()),
})

In [198]:
# Rescale the heat values by a constant factor of 100000.
# NOTE: re-running this cell multiplies the already-scaled column again.
hot_df["value"] = hot_df["value"].mul(100000)

In [199]:
# Index the rows by the parsed "YYYY-MM-DD" date (the text column is kept) and
# put them in chronological order.
hot_df = hot_df.set_index(
    hot_df["date"].map(lambda day_text: datetime.datetime.strptime(day_text, "%Y-%m-%d"))
).sort_index()

In [200]:
# Display the scaled daily heat values (the whole frame is rendered).
hot_df

Unnamed: 0_level_0,date,value
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-12,2017-01-12,165.931039
2017-01-13,2017-01-13,300.133824
2017-01-14,2017-01-14,69.107903
2017-01-15,2017-01-15,190.550596
2017-01-16,2017-01-16,196.414041
2017-01-17,2017-01-17,145.341209
2017-01-18,2017-01-18,382.560092
2017-01-19,2017-01-19,254.528858
2017-01-20,2017-01-20,352.328303
2017-01-21,2017-01-21,547.175662


In [201]:
# Rewrite the date column from "YYYY-MM-DD" text to compact "YYYYMMDD" text.
# NOTE: after one run the values no longer match "%Y-%m-%d", so running this
# cell a second time raises in strptime.
hot_df["date"] = hot_df["date"].apply(
    lambda day_text: datetime.datetime.strptime(day_text, "%Y-%m-%d").strftime("%Y%m%d")
)

In [202]:
# Save the heat series as <asset_class_dir>/<save_name>.csv; the datetime index
# is not written, only the columns.
hot_df.to_csv("{}/{}.csv".format(asset_class_dir, save_name), index=False)

In [160]:
# Fetch historical market data for the code "sh" through tushare.
# NOTE(review): "sh" is presumably the Shanghai Composite index - confirm
# against the tushare documentation. This is a network call.
df = ts.get_hist_data("sh")

In [162]:
# Convert the index values to pandas datetimes so they can be compared with
# datetime objects and formatted with strftime in later cells.
df.index = pd.to_datetime(df.index)

In [163]:
# Keep only the rows dated 2017-01-12 or later.
# NOTE(review): presumably chosen to match the first day of the news counts -
# confirm.
df = df.loc[df.index >= datetime.datetime(2017, 1, 12)]

In [165]:
# Reduce the frame to the single "close" column (still a DataFrame, not a Series).
df = df.loc[:, ["close"]]

In [166]:
# Rename the only remaining column to "value", the same column name used in
# the keyword heat CSV.
df.columns = ["value"]

In [167]:
# Add a "date" column holding each index timestamp formatted as "YYYYMMDD".
# assign() returns a new frame, so the column is not written into a slice of
# the downloaded data; the direct df["date"] = ... assignment raised pandas'
# SettingWithCopyWarning here.
df = df.assign(date=df.index.map(lambda stamp: stamp.strftime("%Y%m%d")))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [168]:
# Display the prepared series (the whole frame is rendered).
df

Unnamed: 0_level_0,value,date
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-01-24,3142.88,20170124
2017-01-23,3136.77,20170123
2017-01-20,3123.14,20170120
2017-01-19,3101.3,20170119
2017-01-18,3113.01,20170118
2017-01-17,3108.77,20170117
2017-01-16,3103.43,20170116
2017-01-13,3112.76,20170113
2017-01-12,3119.29,20170112


In [169]:
# Save the series as <asset_class_dir>/stock.csv; the datetime index is not
# written, only the columns.
df.to_csv("{}/stock.csv".format(asset_class_dir), index=False)