# 時中指數

# Load Data 

In [5]:
import ast
from datetime import datetime, timedelta

import pandas as pd

df = pd.read_csv('./now_news_preprocessed.csv',sep='|')

In [6]:
df.head(1)

Unnamed: 0,item_id,date,category,title,content,sentiment,summary,top_key_freq,tokens,tokens_v2,entities,token_pos,link,photo_link
0,column/_20220505_1001,2022-05-05,焦點,配發率32%！富邦金每股配4元股利、現金股利歷史新高,富邦金控今（5）日公告，董事會決議通過擬配發普通股每股現金股利3.5元及股票股利0.5元，合...,0.0,"['富邦金這次同時配發股票股利每股0.5元', '包含富邦人壽上繳155.78億元、台北富邦...","[('股利', 18), ('富邦', 12), ('現金', 9), ('股票', 5),...","['富邦', '金控', '今', '（5', '）', '日', '公告', '，', '...","['富邦', '金控', '董事會', '通過', '普通股', '現金', '股利', '...","[NerToken(word='富邦金控', ner='ORG', idx=(0, 4)),...","[('富邦', 'Nb'), ('金控', 'Na'), ('今', 'Nd'), ('（5...",https://www.nownews.com/news/5796259,https://media.nownews.com/nn_media/thumbnail/2...


In [7]:
df.shape

(3671, 14)

# Filter news for selected keywords

In [8]:
# End of the analysis window: the newest date present in the dataset.
# The last row is not necessarily the newest article (the rows are not sorted
# by date), so take the maximum of the column instead of df.iloc[-1].date.
end_date = df.date.max()

# Start of the analysis window: `weeks` weeks before the end date.
weeks = 4  # length of the analysis window, in weeks
start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

In [9]:
df.iloc[-1].date

'2022-06-05'

In [10]:
df.date.max()

'2022-06-08'

In [11]:
keyword = ['柯文哲','文哲']

In [12]:
# Keep the articles dated inside [start_date, end_date] whose tokens_v2 text
# matches at least one keyword (the keywords are joined into one regex).
keyword_pattern = '|'.join(keyword)
in_window = (df['date'] >= start_date) & (df['date'] <= end_date)
mentions_keyword = df['tokens_v2'].str.contains(keyword_pattern)
query_df = df[in_window & mentions_keyword]

In [13]:
len(query_df)

92

# Count how many pieces of news contain these keywords 計算各類別多少篇文章提到該關鍵字

# Count how many times these keywords were mentioned in each category計算各類別出現關鍵字次數

In [14]:
# **計算各類別多少篇文章提到該關鍵字
# **計算各類別出現關鍵字次數

# Categories that always appear in the result dicts; '全部' ("all") is the total.
news_categories = ['焦點', '要聞', '即時', '娛樂', '新奇', '生活', '財經', '專題', '全球', '運動', '全部']


def count_keyword(query_df, keyword):
    """Count keyword mentions and matching articles per news category.

    Args:
        query_df: DataFrame with a ``category`` column and a ``tokens_v2``
            column holding the string form of a list of tokens.
        keyword: collection of keyword strings; a token counts as a mention
            when ``token in keyword`` is true.

    Returns:
        ``(cate_freq, cate_occurrence)``, two dicts keyed by category name.
        ``cate_freq`` is the number of keyword tokens found and
        ``cate_occurrence`` the number of rows seen. Every entry of
        ``news_categories`` is present (0 when absent from ``query_df``),
        categories found only in ``query_df`` are added, and '全部' holds
        the overall total.

    Raises:
        ValueError / SyntaxError: if a ``tokens_v2`` cell is not a valid
            Python literal.
    """
    cate_occurrence = dict.fromkeys(news_categories, 0)
    cate_freq = dict.fromkeys(news_categories, 0)

    for category, raw_tokens in zip(query_df['category'], query_df['tokens_v2']):
        # literal_eval parses the stored list without executing arbitrary
        # code, which eval() would do on whatever text the CSV contains.
        tokens = ast.literal_eval(raw_tokens)
        freq = sum(1 for word in tokens if word in keyword)
        # Update the row's own category and the grand total. dict.get keeps
        # categories missing from news_categories from raising KeyError.
        for key in (category, '全部'):
            cate_occurrence[key] = cate_occurrence.get(key, 0) + 1
            cate_freq[key] = cate_freq.get(key, 0) + freq
    return cate_freq, cate_occurrence

In [15]:
count_keyword(query_df, keyword)

({'焦點': 83,
  '要聞': 148,
  '即時': 25,
  '娛樂': 0,
  '新奇': 0,
  '生活': 98,
  '財經': 0,
  '專題': 0,
  '全球': 1,
  '運動': 14,
  '全部': 369},
 {'焦點': 22,
  '要聞': 38,
  '即時': 10,
  '娛樂': 0,
  '新奇': 0,
  '生活': 19,
  '財經': 0,
  '專題': 0,
  '全球': 1,
  '運動': 2,
  '全部': 92})

# Calculate date-based reported frequency of these keywords 計算被報導的次數以時間為基礎

In [16]:
def get_key_time_freq(query_df, keyword):
    """Return the number of articles per calendar day as chart points.

    Args:
        query_df: DataFrame with a ``date`` column that pandas can parse
            as dates.
        keyword: not used by this function.

    Returns:
        A list of ``{'x': 'YYYY-MM-DD', 'y': count}`` dicts, one per day from
        the earliest to the latest date in ``query_df``; days without any
        article get a count of 0.
    """
    # One row per article, each carrying a count of 1.
    per_article = pd.DataFrame({
        'date_index': pd.to_datetime(query_df.date),
        'freq': [1] * len(query_df),
    })
    # Daily bins; summing the ones yields the number of articles per day.
    daily = per_article.groupby(pd.Grouper(key='date_index', freq='D')).sum()
    return [
        {'x': day.strftime('%Y-%m-%d'), 'y': int(count)}
        for day, count in daily['freq'].items()
    ]

In [17]:
get_key_time_freq(query_df, keyword)

[{'x': '2022-05-11', 'y': 3},
 {'x': '2022-05-12', 'y': 6},
 {'x': '2022-05-13', 'y': 0},
 {'x': '2022-05-14', 'y': 7},
 {'x': '2022-05-15', 'y': 0},
 {'x': '2022-05-16', 'y': 0},
 {'x': '2022-05-17', 'y': 0},
 {'x': '2022-05-18', 'y': 0},
 {'x': '2022-05-19', 'y': 0},
 {'x': '2022-05-20', 'y': 0},
 {'x': '2022-05-21', 'y': 0},
 {'x': '2022-05-22', 'y': 0},
 {'x': '2022-05-23', 'y': 5},
 {'x': '2022-05-24', 'y': 12},
 {'x': '2022-05-25', 'y': 6},
 {'x': '2022-05-26', 'y': 0},
 {'x': '2022-05-27', 'y': 0},
 {'x': '2022-05-28', 'y': 0},
 {'x': '2022-05-29', 'y': 0},
 {'x': '2022-05-30', 'y': 7},
 {'x': '2022-05-31', 'y': 6},
 {'x': '2022-06-01', 'y': 6},
 {'x': '2022-06-02', 'y': 10},
 {'x': '2022-06-03', 'y': 10},
 {'x': '2022-06-04', 'y': 0},
 {'x': '2022-06-05', 'y': 1},
 {'x': '2022-06-06', 'y': 0},
 {'x': '2022-06-07', 'y': 8},
 {'x': '2022-06-08', 'y': 5}]

In [18]:
keyword = ['柯文哲', '文哲']

# Analysis window: the `weeks` weeks ending on the newest date in the dataset.
weeks = 4
end_date = df.date.max()
start_date = (
    datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)
).strftime('%Y-%m-%d')

# Articles inside the window whose tokens_v2 text matches any of the keywords.
query_df = df[
    (df['date'] >= start_date)
    & (df['date'] <= end_date)
    & df['tokens_v2'].str.contains('|'.join(keyword))
]

# Daily article counts, then per-category mention counts and article counts.
freqByDate = get_key_time_freq(query_df, keyword)
cate_freq, cate_occurrence = count_keyword(query_df, keyword)

# Categories reported in the response, in display order.
selectedCategories = ['全部', '焦點', '要聞', '即時', '生活']
freqByCate = [cate_occurrence[category] for category in selectedCategories]

response = {
    'freqByDate': freqByDate,
    'freqByCate': freqByCate,
    'category': selectedCategories,
    'num_frequency': cate_freq['全部'],  # how many times the keyword was mentioned
    'num_occurrence': cate_occurrence['全部'],  # how many articles mention the keyword
}

In [19]:
freqByCate

[92, 22, 38, 10, 19]

In [20]:
cate_freq

{'焦點': 83,
 '要聞': 148,
 '即時': 25,
 '娛樂': 0,
 '新奇': 0,
 '生活': 98,
 '財經': 0,
 '專題': 0,
 '全球': 1,
 '運動': 14,
 '全部': 369}

In [21]:
cate_occurrence

{'焦點': 22,
 '要聞': 38,
 '即時': 10,
 '娛樂': 0,
 '新奇': 0,
 '生活': 19,
 '財經': 0,
 '專題': 0,
 '全球': 1,
 '運動': 2,
 '全部': 92}

In [22]:
freqByDate

[{'x': '2022-05-11', 'y': 3},
 {'x': '2022-05-12', 'y': 6},
 {'x': '2022-05-13', 'y': 0},
 {'x': '2022-05-14', 'y': 7},
 {'x': '2022-05-15', 'y': 0},
 {'x': '2022-05-16', 'y': 0},
 {'x': '2022-05-17', 'y': 0},
 {'x': '2022-05-18', 'y': 0},
 {'x': '2022-05-19', 'y': 0},
 {'x': '2022-05-20', 'y': 0},
 {'x': '2022-05-21', 'y': 0},
 {'x': '2022-05-22', 'y': 0},
 {'x': '2022-05-23', 'y': 5},
 {'x': '2022-05-24', 'y': 12},
 {'x': '2022-05-25', 'y': 6},
 {'x': '2022-05-26', 'y': 0},
 {'x': '2022-05-27', 'y': 0},
 {'x': '2022-05-28', 'y': 0},
 {'x': '2022-05-29', 'y': 0},
 {'x': '2022-05-30', 'y': 7},
 {'x': '2022-05-31', 'y': 6},
 {'x': '2022-06-01', 'y': 6},
 {'x': '2022-06-02', 'y': 10},
 {'x': '2022-06-03', 'y': 10},
 {'x': '2022-06-04', 'y': 0},
 {'x': '2022-06-05', 'y': 1},
 {'x': '2022-06-06', 'y': 0},
 {'x': '2022-06-07', 'y': 8},
 {'x': '2022-06-08', 'y': 5}]

In [23]:
response

{'freqByDate': [{'x': '2022-05-11', 'y': 3},
  {'x': '2022-05-12', 'y': 6},
  {'x': '2022-05-13', 'y': 0},
  {'x': '2022-05-14', 'y': 7},
  {'x': '2022-05-15', 'y': 0},
  {'x': '2022-05-16', 'y': 0},
  {'x': '2022-05-17', 'y': 0},
  {'x': '2022-05-18', 'y': 0},
  {'x': '2022-05-19', 'y': 0},
  {'x': '2022-05-20', 'y': 0},
  {'x': '2022-05-21', 'y': 0},
  {'x': '2022-05-22', 'y': 0},
  {'x': '2022-05-23', 'y': 5},
  {'x': '2022-05-24', 'y': 12},
  {'x': '2022-05-25', 'y': 6},
  {'x': '2022-05-26', 'y': 0},
  {'x': '2022-05-27', 'y': 0},
  {'x': '2022-05-28', 'y': 0},
  {'x': '2022-05-29', 'y': 0},
  {'x': '2022-05-30', 'y': 7},
  {'x': '2022-05-31', 'y': 6},
  {'x': '2022-06-01', 'y': 6},
  {'x': '2022-06-02', 'y': 10},
  {'x': '2022-06-03', 'y': 10},
  {'x': '2022-06-04', 'y': 0},
  {'x': '2022-06-05', 'y': 1},
  {'x': '2022-06-06', 'y': 0},
  {'x': '2022-06-07', 'y': 8},
  {'x': '2022-06-08', 'y': 5}],
 'freqByCate': [92, 22, 38, 10, 19],
 'category': ['全部', '焦點', '要聞', '即時', '生活'],
 

## Save data to csv file

In [24]:
df_data = pd.DataFrame(list(response.items()),columns=['name','value'])

In [25]:
df_data

Unnamed: 0,name,value
0,freqByDate,"[{'x': '2022-05-11', 'y': 3}, {'x': '2022-05-1..."
1,freqByCate,"[92, 22, 38, 10, 19]"
2,category,"[全部, 焦點, 要聞, 即時, 生活]"
3,num_frequency,369
4,num_occurrence,92


In [23]:
## Save as a CSV file
df_data.to_csv('ko_wen_je_data.csv',sep=',', index=None)

### Alternative way: using zip

In [24]:
k=list(response.keys())
v=list(response.values())

In [25]:
#list(zip(k,v))

In [26]:
df_data = pd.DataFrame(list(zip(k,v)),columns=['name','value'])
df_data

Unnamed: 0,name,value
0,freqByDate,"[{'x': '2022-03-26', 'y': 6}, {'x': '2022-03-2..."
1,freqByCate,"[368, 49, 3, 294, 0]"
2,category,"[全部, 政治, 產經, 生活, 社會]"
3,num_frequency,1306
4,num_occurrence,368


## Read csv file and convert to dict format

In [27]:
df_data = pd.read_csv('chen_shih_chung_data.csv')

In [28]:
df_data 

Unnamed: 0,name,value
0,freqByDate,"[{'x': '2022-03-26', 'y': 6}, {'x': '2022-03-2..."
1,freqByCate,"[368, 49, 3, 294, 0]"
2,category,"['全部', '政治', '產經', '生活', '社會']"
3,num_frequency,1306
4,num_occurrence,368


In [29]:
# Convert to dictionary format
dict(list(df_data.values))

{'freqByDate': "[{'x': '2022-03-26', 'y': 6}, {'x': '2022-03-27', 'y': 11}, {'x': '2022-03-28', 'y': 10}, {'x': '2022-03-29', 'y': 7}, {'x': '2022-03-30', 'y': 15}, {'x': '2022-03-31', 'y': 9}, {'x': '2022-04-01', 'y': 10}, {'x': '2022-04-02', 'y': 11}, {'x': '2022-04-03', 'y': 8}, {'x': '2022-04-04', 'y': 10}, {'x': '2022-04-05', 'y': 12}, {'x': '2022-04-06', 'y': 13}, {'x': '2022-04-07', 'y': 19}, {'x': '2022-04-08', 'y': 13}, {'x': '2022-04-09', 'y': 14}, {'x': '2022-04-10', 'y': 7}, {'x': '2022-04-11', 'y': 8}, {'x': '2022-04-12', 'y': 16}, {'x': '2022-04-13', 'y': 19}, {'x': '2022-04-14', 'y': 18}, {'x': '2022-04-15', 'y': 15}, {'x': '2022-04-16', 'y': 12}, {'x': '2022-04-17', 'y': 12}, {'x': '2022-04-18', 'y': 17}, {'x': '2022-04-19', 'y': 14}, {'x': '2022-04-20', 'y': 12}, {'x': '2022-04-21', 'y': 15}, {'x': '2022-04-22', 'y': 17}, {'x': '2022-04-23', 'y': 18}]",
 'freqByCate': '[368, 49, 3, 294, 0]',
 'category': "['全部', '政治', '產經', '生活', '社會']",
 'num_frequency': '1306',
 'num

In [30]:
list(df_data.values)

[array(['freqByDate',
        "[{'x': '2022-03-26', 'y': 6}, {'x': '2022-03-27', 'y': 11}, {'x': '2022-03-28', 'y': 10}, {'x': '2022-03-29', 'y': 7}, {'x': '2022-03-30', 'y': 15}, {'x': '2022-03-31', 'y': 9}, {'x': '2022-04-01', 'y': 10}, {'x': '2022-04-02', 'y': 11}, {'x': '2022-04-03', 'y': 8}, {'x': '2022-04-04', 'y': 10}, {'x': '2022-04-05', 'y': 12}, {'x': '2022-04-06', 'y': 13}, {'x': '2022-04-07', 'y': 19}, {'x': '2022-04-08', 'y': 13}, {'x': '2022-04-09', 'y': 14}, {'x': '2022-04-10', 'y': 7}, {'x': '2022-04-11', 'y': 8}, {'x': '2022-04-12', 'y': 16}, {'x': '2022-04-13', 'y': 19}, {'x': '2022-04-14', 'y': 18}, {'x': '2022-04-15', 'y': 15}, {'x': '2022-04-16', 'y': 12}, {'x': '2022-04-17', 'y': 12}, {'x': '2022-04-18', 'y': 17}, {'x': '2022-04-19', 'y': 14}, {'x': '2022-04-20', 'y': 12}, {'x': '2022-04-21', 'y': 15}, {'x': '2022-04-22', 'y': 17}, {'x': '2022-04-23', 'y': 18}]"],
       dtype=object),
 array(['freqByCate', '[368, 49, 3, 294, 0]'], dtype=object),
 array(['category

In [31]:
response = dict(list(df_data.values))


In [32]:
response['num_occurrence']

'368'

In [33]:
type(response['num_occurrence'])


str

In [34]:
type(response['freqByCate'])


str

In [45]:
# The CSV round trip turned the list into its string form; parse it back.
# literal_eval only accepts Python literals, so unlike eval() it cannot
# execute code that happens to be stored in the file.
response['freqByCate'] = ast.literal_eval(response['freqByCate'])


In [46]:
response


{'freqByDate': "[{'x': '2022-03-26', 'y': 6}, {'x': '2022-03-27', 'y': 11}, {'x': '2022-03-28', 'y': 10}, {'x': '2022-03-29', 'y': 7}, {'x': '2022-03-30', 'y': 15}, {'x': '2022-03-31', 'y': 9}, {'x': '2022-04-01', 'y': 10}, {'x': '2022-04-02', 'y': 11}, {'x': '2022-04-03', 'y': 8}, {'x': '2022-04-04', 'y': 10}, {'x': '2022-04-05', 'y': 12}, {'x': '2022-04-06', 'y': 13}, {'x': '2022-04-07', 'y': 19}, {'x': '2022-04-08', 'y': 13}, {'x': '2022-04-09', 'y': 14}, {'x': '2022-04-10', 'y': 7}, {'x': '2022-04-11', 'y': 8}, {'x': '2022-04-12', 'y': 16}, {'x': '2022-04-13', 'y': 19}, {'x': '2022-04-14', 'y': 18}, {'x': '2022-04-15', 'y': 15}, {'x': '2022-04-16', 'y': 12}, {'x': '2022-04-17', 'y': 12}, {'x': '2022-04-18', 'y': 17}, {'x': '2022-04-19', 'y': 14}, {'x': '2022-04-20', 'y': 12}, {'x': '2022-04-21', 'y': 15}, {'x': '2022-04-22', 'y': 17}, {'x': '2022-04-23', 'y': 18}]",
 'freqByCate': [368, 49, 3, 294, 0],
 'category': "['全部', '政治', '產經', '生活', '社會']",
 'num_frequency': '1306',
 'num_o

### How to convert a list into a dict?

In [35]:
# How to convert a list into a dict?
[['one',[1,2,3]], ['two',2]]

[['one', [1, 2, 3]], ['two', 2]]

In [36]:
# How to convert a list into a dict?
[['one',[1,2,3]], ['two',2]]

[['one', [1, 2, 3]], ['two', 2]]

In [37]:
dict([['one',[1,2,3]], ['two',2]])

{'one': [1, 2, 3], 'two': 2}

## All-in-one function

In [38]:
# Load Data 
import pandas as pd
from datetime import datetime, timedelta

df = pd.read_csv('./news_dataset_preprocessed_for_django.csv',sep='|')

In [39]:

def process_data(keyword, weeks=4):
    """Build the keyword statistics for the most recent ``weeks`` weeks of news.

    Uses the module-level ``df`` together with ``get_key_time_freq`` and
    ``count_keyword`` defined earlier in the notebook.

    Args:
        keyword: list of keyword strings. They are joined with ``|`` and used
            as a regular-expression pattern against ``df['tokens_v2']``.
        weeks: length of the analysis window in weeks, counted back from the
            latest date in ``df``.

    Returns:
        dict with the keys ``freqByDate`` (daily article counts),
        ``freqByCate`` (article counts for ``category``, in the same order),
        ``category``, ``num_frequency`` (total keyword mentions) and
        ``num_occurrence`` (total articles mentioning the keyword).
    """
    end_date = df.date.max()
    # Honour the caller's `weeks`; it used to be overwritten with 4 here.
    start_date = (datetime.strptime(end_date, '%Y-%m-%d').date() - timedelta(weeks=weeks)).strftime('%Y-%m-%d')

    query_df = df[
        (df['date'] >= start_date) & (df['date'] <= end_date) & df['tokens_v2'].str.contains('|'.join(keyword))]
    freqByDate = get_key_time_freq(query_df, keyword)

    cate_freq, cate_occurrence = count_keyword(query_df, keyword)

    selectedCategories = ['全部', '政治', '生活', '社會']

    # A selected category that has no counter is reported as 0 instead of
    # aborting the whole response with a KeyError.
    freqByCate = [cate_occurrence.get(k, 0) for k in selectedCategories]

    response =  {'freqByDate': freqByDate,
            'freqByCate': freqByCate,
            'category': selectedCategories,
            'num_frequency': int(cate_freq['全部']), # how many times the keyword was mentioned
            'num_occurrence': int(cate_occurrence['全部']) # how many articles mention the keyword
            }

    return response

In [40]:
keyword = ['陳時中','時中']
weeks=4
data_response = process_data(keyword, weeks)

In [41]:
df_data = pd.DataFrame(list(data_response.items()),columns=['name','value'])
## Save as a CSV file
df_data.to_csv('chen_shih_chung_data.csv',sep=',', index=None)

In [42]:
df_data = pd.read_csv('chen_shih_chung_data.csv')

In [43]:
df_data 

Unnamed: 0,name,value
0,freqByDate,"[{'x': '2022-03-26', 'y': 6}, {'x': '2022-03-2..."
1,freqByCate,"[368, 49, 294, 0]"
2,category,"['全部', '政治', '生活', '社會']"
3,num_frequency,1306
4,num_occurrence,368


# views.py in Django

In [44]:
from django.http import JsonResponse
from django.shortcuts import render
import pandas as pd

def load_data_scchen():
    """Load the keyword statistics CSV into the module-level ``response`` dict.

    Every CSV row is unpacked as a (name, value) pair; the names become the
    keys of ``response``. Values are stored exactly as pandas read them
    (no parsing of list-like strings is done here).
    """
    global response
    # Read data from csv file
    stats_table = pd.read_csv('app_scchen/dataset/chen_shih_chung_data.csv', sep=',')
    response = {name: value for name, value in stats_table.values}

# Load the keyword statistics once, when this module is executed
load_data_scchen()

def home(request):
    """Render the app_scchen home page with the module-level ``response`` as context."""
    template_name = 'app_scchen/home.html'
    return render(request, template_name, response)

print('app_scchen was loaded!')

FileNotFoundError: [Errno 2] No such file or directory: 'app_scchen/dataset/chen_shih_chung_data.csv'

        # Saving data with other file format

        ## Save to text file
        # Save data
        f = open('pk_politician.txt','w')
        f.write(str(data_pk))
        f.close()

        # Load data
        f = open('pk_politician.txt','r')
        data_pk = f.read()

        # dictionary format
        eval(data_pk)
        ## Save to json file
        import json

        # Save data
        with open('pk_politician.txt', 'w') as file:
            json.dump( data_pk, file)

        # Load data
        with open('pk_politician.txt', 'r') as file:
            data_pk = json.load(file)

        # dictionary format
        data_pk

        ## Save to MongoDB (a well-known NoSQL database)
        from pymongo import MongoClient

        # Save data
        client = MongoClient()
        database = client["cnaNews"]  # SQL: Database Name
        table_KTH = database["pk_politicianTsaiHan"]   # SQL: Table Name

        table_KTH.drop()
        table_KTH.insert_one(data)


        # Load data
        from pymongo import MongoClient
        client = MongoClient()
        database = client["cnaNews"]  # SQL: Database Name
        table_KTH  = database["pkData"]   # SQL: Table Name

        for x in table_KTH.find():
            print(x)

        list(table_KTH.find({}, {'_id': False}))
