# 演習III 第４回 ツイッターテキスト解析

出典：https://qiita.com/e10persona/items/7a7643b266c2bdfbf7d0

Google Colab 用

In [None]:
# Environment setup (shell commands run from the notebook via the `!` prefix).
# Install aptitude, then use it to install MeCab, its development headers,
# the UTF-8 IPA dictionary, and a few command-line tools.
!apt install aptitude
!aptitude install mecab libmecab-dev mecab-ipadic-utf8 git make curl xz-utils file -y
# Pin the Python binding for MeCab to version 0.7.
!pip install mecab-python3==0.7
# Install the IPA Gothic font package.
# NOTE(review): presumably this provides the Japanese font used by the word cloud cell — confirm the path.
!apt-get -y install fonts-ipafont-gothic

必要なライブラリのインストール

In [None]:
# Install the Python packages imported by the cells below.
# NOTE(review): mecab-python3 is listed here without a version pin although the
# setup cell pins 0.7 — confirm which version is intended.
!pip install tweepy mecab-python3 wordcloud oseti japanize-matplotlib

ライブラリのインポートを行います。

In [39]:
import tweepy
import MeCab
import csv
import json
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import japanize_matplotlib
import warnings
import numpy as np
import pandas as pd
import oseti
import collections
import codecs
import random

## MeCab のテスト

MeCab の動作確認として、例文を形態素解析し、その結果を表示します。

In [None]:
# Smoke test for MeCab: build a tagger with the "-Ochasen" output option,
# analyse one sample sentence and print the raw result.
# `mecab` is kept as a module-level name because later cells reuse it.
sample_sentence = "すもももももももものうち"
mecab = MeCab.Tagger("-Ochasen")
malist = mecab.parse(sample_sentence)
print(malist)

## Tweet の取得

In [6]:
import os

# Twitter API credentials are read from environment variables instead of being
# hard-coded, so that secrets are never stored in (or shared with) the notebook.
# Set them before running this cell, e.g.
#   os.environ['TWITTER_CONSUMER_KEY'] = '...'
# SECURITY: any key that was previously committed in this file should be
# considered leaked and must be revoked/regenerated in the developer portal.
_CREDENTIAL_VARS = (
    'TWITTER_CONSUMER_KEY',
    'TWITTER_CONSUMER_SECRET',
    'TWITTER_ACCESS_TOKEN',
    'TWITTER_ACCESS_TOKEN_SECRET',
)
_missing_vars = [name for name in _CREDENTIAL_VARS if not os.environ.get(name)]
if _missing_vars:
    # Fail early with an actionable message rather than an opaque auth error later.
    raise RuntimeError(
        'Missing Twitter API credentials; set environment variable(s): '
        + ', '.join(_missing_vars)
    )

CK = os.environ['TWITTER_CONSUMER_KEY']           # Consumer Key
CS = os.environ['TWITTER_CONSUMER_SECRET']        # Consumer Secret
AT = os.environ['TWITTER_ACCESS_TOKEN']           # Access Token
AS = os.environ['TWITTER_ACCESS_TOKEN_SECRET']    # Access Token Secret

# Build the OAuth handler and the API client used by the cells below.
# wait_on_rate_limit=True is passed so the client waits instead of failing
# when the rate limit is reached.
auth = tweepy.OAuthHandler(CK, CS)
auth.set_access_token(AT, AS)
api = tweepy.API(auth, wait_on_rate_limit=True)

### 特定ユーザの Tweet を取得

In [7]:
# Cursor over the timeline of a single account.
# tweet_mode='extended' is requested because later cells read the 'full_text' field.
tweets = tweepy.Cursor(
    api.user_timeline,
    screen_name="ndanyusi",
    tweet_mode='extended',
)

### 指定したキーワードを含む Tweet を取得

In [None]:
# Cursor over a keyword search (retweets excluded through the query string).
# This rebinds `tweets`, replacing any cursor created by an earlier cell.
search_params = {
    'q': "防衛大 exclude:retweets",
    'include_entities': True,
    'tweet_mode': 'extended',
    'lang': 'ja',
    'result_type': 'mixed',
}
tweets = tweepy.Cursor(api.search_tweets, **search_params)

## 結果の表示

In [None]:
# Preview the first three results: print each tweet's full text on a single
# line (newlines replaced by spaces), followed by a separator line.
for status in tweets.items(3):
    payload = status._json
    one_line_text = payload['full_text'].replace('\n', ' ')
    print(one_line_text)
    print("=================================")

In [None]:
# Fetch up to 100 tweets and flatten their JSON payloads into one DataFrame.
# The payloads are collected first and normalised in a single call: growing a
# DataFrame with pd.concat inside the loop re-copies all previous rows on
# every iteration (quadratic work) for the same result.
tweet_records = [status._json for status in tweets.items(100)]
df = pd.json_normalize(tweet_records)
# Strip newline characters from every string cell so each tweet stays on one CSV row.
df = df.replace('\n', '', regex=True)
df

## 結果の保存

In [10]:
# Keep only the columns of interest and export them to 'tweets.csv'.
# The file is written as Shift-JIS; errors='ignore' drops characters that
# cannot be represented in that encoding instead of raising.
export_columns = [
    'created_at',
    'user.name',
    'full_text',
    'favorite_count',
    'retweet_count',
    'user.followers_count',
]
df2 = df[export_columns]
df2.to_csv('tweets.csv', encoding='shift-jis', errors='ignore', index=False)

## 結果の読み込み

In [None]:
# Reload the exported CSV (decoded as cp932) and display it.
csv_path = 'tweets.csv'
df2 = pd.read_csv(csv_path, encoding='cp932')
df2

## MeCab で分析

### 分かち書き、品詞の抜き出し

In [33]:
# Tokenise every tweet with MeCab and collect the surface forms of the tokens
# whose part of speech (first comma-separated field of the feature string)
# is one of the selected categories.
# Candidate categories: "名詞", "動詞", "形容詞", "副詞" — pick the ones to keep here.
selected_pos = ["名詞", "動詞", "形容詞"]
words = []
for text in df['full_text']:
    token = mecab.parseToNode(text)
    # Walk the linked list of nodes until it ends.
    while token:
        pos = token.feature.split(",")[0]
        if pos in selected_pos:
            words.append(token.surface)
        token = token.next


### WordCloud として表示

In [None]:
# Render the extracted words as a word cloud, save it to 'word_cloud.png'
# and display it.
wakati_text = " ".join(words)   # space-separated text, the input format WordCloud expects
fpath = '/usr/share/fonts/truetype/fonts-japanese-gothic.ttf'  # Japanese-capable font
stop_words = ['https', 't', 'co', 'RT']   # URL fragments and the retweet marker
wordcloud = WordCloud(
    font_path=fpath,
    width=900, height=600,   # default width=400, height=200
    background_color="white",   # default="black"
    stopwords=set(stop_words),
    max_words=500,   # default=200
    min_font_size=4,   # default=4
    collocations=False   # default=True
    ).generate(wakati_text)   # fixed: previously passed the undefined name `txt` (NameError)

plt.figure(figsize=(15,12))
plt.imshow(wordcloud)
plt.axis("off")
# Save before show(): the figure must be written while it is still current.
plt.savefig("word_cloud.png")
plt.show()

## ポジ・ネガ分析

In [None]:
# Create the oseti analyzer (reused by later cells) and print the detailed
# analysis of two sample sentences.
analyzer = oseti.Analyzer()
sample_sentences = (
    "最後まで希望を捨てちゃいかん。あきらめたら、そこで試合終了だよ。",
    "認めたくないものだな。自分自身の、若さゆえの過ちというものを。",
)
for sentence in sample_sentences:
    print(analyzer.analyze_detail(sentence))

In [None]:
# For every tweet: print the text, print the detailed analysis together with
# the mean of the scores returned by analyzer.analyze(), and record that mean.
ave_senti = []
for text in df['full_text']:
    print(text)
    scores = analyzer.analyze(text)
    detail = analyzer.analyze_detail(text)
    mean_score = np.mean(scores)
    print(detail, mean_score)
    ave_senti.append(mean_score)
ave_senti

In [None]:
# Bucket the per-tweet mean polarity into four equal-width classes and draw
# them as a pie chart.
# The range is pinned to (-1, 1): without it np.histogram spans only the
# observed min..max, so the fixed labels below would not correspond to fixed
# polarity ranges (e.g. all-positive data would still get a "ネガ" slice).
# With an explicit range the bins are [-1,-0.5), [-0.5,0), [0,0.5), [0.5,1],
# and NaN means (tweets with no scored sentence) are simply left out.
# NOTE(review): assumes oseti scores lie in [-1, 1] — confirm against the oseti docs.
hist, bins = np.histogram(ave_senti, bins=4, range=(-1, 1))
plt.pie(hist, labels=['ネガ', 'ややネガ', 'ややポジ', 'ポジ'], counterclock=False, startangle=90)

## 感情分析（Transformer 版）

In [None]:
# Install the packages needed for the Transformer-based sentiment analysis:
# transformers itself plus fugashi and ipadic.
# NOTE(review): fugashi/ipadic are presumably required by BertJapaneseTokenizer — confirm.
!pip install transformers fugashi ipadic

In [None]:
from transformers import pipeline 
from transformers import AutoModelForSequenceClassification 
from transformers import BertJapaneseTokenizer 

In [None]:
# Load the pretrained sequence-classification model and the Japanese BERT
# tokenizer, then wrap both in a "sentiment-analysis" pipeline (`nlp`) that
# the following cells call.
sentiment_model_name = 'daigo/bert-base-japanese-sentiment'
tokenizer_name = 'cl-tohoku/bert-base-japanese-whole-word-masking'

model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
tokenizer = BertJapaneseTokenizer.from_pretrained(tokenizer_name)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

Downloading: 100%|██████████| 1.29k/1.29k [00:00<00:00, 285kB/s]
Downloading: 100%|██████████| 422M/422M [01:49<00:00, 4.06MB/s]
Downloading: 100%|██████████| 252k/252k [00:00<00:00, 376kB/s]
Downloading: 100%|██████████| 110/110 [00:00<00:00, 54.3kB/s]
Downloading: 100%|██████████| 479/479 [00:00<00:00, 235kB/s]


In [None]:
# Run the sentiment pipeline on two sample sentences and print the raw results.
demo_sentences = (
    "最後まで希望を捨てちゃいかん。あきらめたら、そこで試合終了だよ。",
    "認めたくないものだな。自分自身の、若さゆえの過ちというものを。",
)
for sentence in demo_sentences:
    print(nlp(sentence))

In [None]:
# Run the sentiment pipeline on every tweet, printing the text and the raw
# result, and collect the 'score' of the first result for each tweet.
# Finally print all collected scores and their mean.
# NOTE(review): only 'score' is read; the predicted 'label' is ignored, so the
# mean mixes scores regardless of which label they belong to — confirm this is intended.
sentiments = []
for text in df['full_text']:
    print(text)
    result = nlp(text)
    print(result)
    top_prediction = result[0]
    sentiments.append(top_prediction['score'])
print(sentiments)
print(np.mean(sentiments))

# 課題
各自のテーマで Tweet を分析せよ。