# 清理与分析数据

## 收集

#### 导入需要的库

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import display
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('max_colwidth',150)

#### twitter_archive_enhanced 保存为DataFrame。

In [2]:
df_raw = pd.read_csv('twitter-archive-enhanced.txt',sep=',',header=0)

#### image-prediction.tsv 转换为DataFrame

In [3]:
df_img = pd.read_csv('image-predictions.tsv',sep='\t',header=0)

#### tweet_json.txt 转换为DataFrame格式
+ 出现错误：'ValueError: Trailing data'
+ 解决方法：添加参数lines=True
+ 参考：https://stackoverflow.com/questions/30088006/loading-a-file-with-more-than-one-line-of-json-into-pandas


In [4]:
df_supl = pd.read_json('tweet_json.txt',lines=True)

## 评估

### 针对twitter-archive-enhanced.csv文件的评估

In [5]:
df_raw.sample(5)

Unnamed: 0,tweet_id,in_reply_to_status_id,in_reply_to_user_id,timestamp,source,text,retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp,expanded_urls,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo
43,884162670584377345,,,2017-07-09 21:29:42 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Meet Yogi. He doesn't have any important dog meetings today he just enjoys looking his best at all times. 12/10 for dangerously dapper doggo https...,,,,https://twitter.com/dog_rates/status/884162670584377345/photo/1,12,10,Yogi,doggo,,,
2068,671138694582165504,,,2015-11-30 01:28:28 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Me running from commitment. 10/10 https://t.co/ycVJyFFkES,,,,https://twitter.com/dog_rates/status/671138694582165504/photo/1,10,10,,,,,
1320,706346369204748288,,,2016-03-06 05:11:12 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Koda. She's a Beneboom Cumberwiggle. 12/10 petable as hell https://t.co/VZV6oMJmU6,,,,"https://twitter.com/dog_rates/status/706346369204748288/photo/1,https://twitter.com/dog_rates/status/706346369204748288/photo/1",12,10,Koda,,,,
1880,675006312288268288,,,2015-12-10 17:37:00 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",Say hello to Mollie. This pic was taken after she bet all her toys on Ronda Rousey. 10/10 hang in there pupper https://t.co/QMmAqA9VqO,,,,https://twitter.com/dog_rates/status/675006312288268288/photo/1,10,10,Mollie,,,pupper,
1768,678389028614488064,,,2015-12-20 01:38:42 +0000,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",This is Bella. She just learned that her final grade in chem was a 92.49 \npoor pupper 11/10 https://t.co/auOoKuoveM,,,,https://twitter.com/dog_rates/status/678389028614488064/photo/1,11,10,Bella,,,pupper,


In [6]:
df_raw.shape

(2356, 17)

In [7]:
df_raw.columns

Index(['tweet_id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'timestamp',
       'source', 'text', 'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_timestamp', 'expanded_urls', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')

#### 共有17个特征，2356条记录。

+ tweet_id，是每个tweet的唯一标识
+ 'in_reply_to_status_id',Nullable. If the represented Tweet is a reply, this field will contain the integer representation of the original Tweet’s ID
+ 'in_reply_to_user_id', Nullable. If the represented Tweet is a reply, this field will contain the integer representation of the original Tweet’s author ID. This will not necessarily always be the user directly mentioned in the Tweet.
+ 'timestamp',
+ 'source',Utility used to post the Tweet, as an HTML-formatted string. 
+ 'text',The actual UTF-8 text of the status update.tweet中的文本内容
+ 'retweeted_status_id',
+ 'retweeted_status_user_id',
+ 'retweeted_status_timestamp', 
+ 'expanded_urls',
+ 'rating_numerator',评分的分子部分，一般大于10，也可能存在小于10的情况。
+ 'rating_denominator',评分的分母部分，一般为10
+ 'name',来自tweet的文本内容，判断是狗的名字。
+ 'doggo',一种狗的等级，等级一共有四种。
+ 'floofer',一种狗的等级，等级一共有四种。
+ 'pupper',一种狗的等级，等级一共有四种。
+ 'puppo'，一种狗的等级，等级一共有四种。

参考：
+ https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object

In [8]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), ob

In [9]:
df_raw.isnull().any()

tweet_id                      False
in_reply_to_status_id          True
in_reply_to_user_id            True
timestamp                     False
source                        False
text                          False
retweeted_status_id            True
retweeted_status_user_id       True
retweeted_status_timestamp     True
expanded_urls                  True
rating_numerator              False
rating_denominator            False
name                          False
doggo                         False
floofer                       False
pupper                        False
puppo                         False
dtype: bool

In [10]:
# tweet_id alone identifies a tweet, so duplicates are counted on that key only;
# including 'name' in the subset could only hide repeated ids.
df_raw.duplicated(subset=['tweet_id']).sum()

0

In [11]:
df_raw['name'].value_counts().nlargest(20)

None       745
a           55
Charlie     12
Lucy        11
Oliver      11
Cooper      11
Tucker      10
Lola        10
Penny       10
Bo           9
Winston      9
the          8
Sadie        8
Toby         7
Daisy        7
Buddy        7
an           7
Bailey       7
Bella        6
Rusty        6
Name: name, dtype: int64

In [12]:
df_raw[['rating_numerator','rating_denominator']].describe()

Unnamed: 0,rating_numerator,rating_denominator
count,2356.0,2356.0
mean,13.126486,10.455433
std,45.876648,6.745237
min,0.0,0.0
25%,10.0,10.0
50%,11.0,10.0
75%,12.0,10.0
max,1776.0,170.0


In [13]:
df_raw.rating_numerator.value_counts().nlargest(20)

12     558
11     464
10     461
13     351
9      158
8      102
7       55
14      54
5       37
6       32
3       19
4       17
1        9
2        9
420      2
0        2
15       2
75       2
80       1
20       1
Name: rating_numerator, dtype: int64

In [14]:
df_raw.rating_denominator.value_counts()

10     2333
11        3
50        3
80        2
20        2
2         1
16        1
40        1
70        1
15        1
90        1
110       1
120       1
130       1
150       1
170       1
7         1
0         1
Name: rating_denominator, dtype: int64

In [15]:
for i in df_raw.text.values[:5]:
    display(i) 

"This is Phineas. He's a mystical boy. Only ever appears in the hole of a donut. 13/10 https://t.co/MgUWQ76dJU"

"This is Tilly. She's just checking pup on you. Hopes you're doing ok. If not, she's available for pats, snugs, boops, the whole bit. 13/10 https://t.co/0Xxu71qeIV"

'This is Archie. He is a rare Norwegian Pouncing Corgo. Lives in the tall grass. You never know when one may strike. 12/10 https://t.co/wUnZnhtVJB'

'This is Darla. She commenced a snooze mid meal. 13/10 happens to the best of us https://t.co/tD36da7qLQ'

'This is Franklin. He would like you to stop calling him "cute." He is a very fierce shark and should be respected as such. 12/10 #BarkWeek https://t.co/AtUZn91f7f'

In [16]:
df_raw.query("name == 'a'").text.values[:5]

array(['Here is a pupper approaching maximum borkdrive. Zooming at never before seen speeds. 14/10 paw-inspiring af \n(IG: puffie_the_chow) https://t.co/ghXBIIeQZF',
       'Here is a perfect example of someone who has their priorities in order. 13/10 for both owner and Forrest https://t.co/LRyMrU7Wfq',
       'Guys this is getting so out of hand. We only rate dogs. This is a Galapagos Speed Panda. Pls only send dogs... 10/10 https://t.co/8lpAGaZRFn',
       'This is a mighty rare blue-tailed hammer sherk. Human almost lost a limb trying to take these. Be careful guys. 8/10 https://t.co/TGenMeXreW',
       'Viewer discretion is advised. This is a terrible attack in progress. Not even in water (tragic af). 4/10 bad sherk https://t.co/L3U0j14N5R'],
      dtype=object)

In [17]:
df_raw[df_raw.rating_numerator==45].text.values

array(['From left to right:\nCletus, Jerome, Alejandro, Burp, &amp; Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK'],
      dtype=object)

In [18]:
df_raw[df_raw.rating_numerator==1776].text.values

array(["This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh"],
      dtype=object)

In [19]:
df_raw[df_raw.rating_numerator==0].text.values[:]

array(["When you're so blinded by your systematic plagiarism that you forget what day it is. 0/10 https://t.co/YbEJPkg4Ag",
       "PUPDATE: can't see any. Even if I could, I couldn't reach them to pet. 0/10 much disappointment https://t.co/c7WXaB2nqX"],
      dtype=object)

In [20]:
df_raw[df_raw.rating_numerator==420].text.values[:1]

array(['@dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research'],
      dtype=object)

In [21]:
df_raw[df_raw.rating_numerator==20].text.values[:1]

array(["I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible"],
      dtype=object)

In [22]:
df_raw[df_raw.rating_denominator==170].text.values

array(['Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv'],
      dtype=object)

In [23]:
df_raw[(df_raw.in_reply_to_status_id.notnull())&(df_raw.in_reply_to_user_id.notnull())].shape

(78, 17)

In [24]:
df_raw[(df_raw.rating_numerator>20)&(df_raw.rating_denominator>10)].shape

(13, 17)

#### 评估df_raw小结：
+ 'timestamp'，'retweeted_status_timestamp'的数据类型是object
+ 'rating_numerator','rating_denominator',最大值分别为1776和170，分别大于均值13和10。
+  'rating_numerator',存在较大的值，比如45，对应的'rating_denominator'是50，这是5只狗评分的分子和分母。
+ 'in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id','retweeted_status_user_id','retweeted_status_timestamp','expanded_urls'存在null
+ 狗狗的等级分布在四个列中。
+ 有些狗没有等级分类。数值是None,不是null
+ 有些狗没有名字，数值是None,而不是null
+ 一些狗的名字是'an','the'或者'a'.
+ timestamp的类型是object

### 针对image-prediction.tsv文件的评估

In [25]:
df_img.sample(5)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
1879,846514051647705089,https://pbs.twimg.com/media/C79sB4xXwAEvwKY.jpg,2,golden_retriever,0.650003,True,Leonberg,0.065199,True,Norfolk_terrier,0.052955,True
996,708349470027751425,https://pbs.twimg.com/media/CdSQFWOWAAApgfq.jpg,1,muzzle,0.24389,False,basenji,0.187158,True,Boston_bull,0.092727,True
2056,888554962724278272,https://pbs.twimg.com/media/DFTH_O-UQAACu20.jpg,3,Siberian_husky,0.700377,True,Eskimo_dog,0.166511,True,malamute,0.111411,True
310,671542985629241344,https://pbs.twimg.com/media/CVHMyHMWwAALYXs.jpg,1,Shetland_sheepdog,0.980339,True,collie,0.006693,True,papillon,0.006157,True
623,680583894916304897,https://pbs.twimg.com/media/CXHrcFYWcAEE5_L.jpg,1,tub,0.889801,False,bathtub,0.032351,False,hippopotamus,0.014177,False


In [26]:
df_img.shape

(2075, 12)

In [27]:
df_img.columns

Index(['tweet_id', 'jpg_url', 'img_num', 'p1', 'p1_conf', 'p1_dog', 'p2',
       'p2_conf', 'p2_dog', 'p3', 'p3_conf', 'p3_dog'],
      dtype='object')

#### 共有12个特征，2075条记录。

+ tweet_id，是每个tweet的唯一标识
+ jpg_url 是预测的图像资源链接
+ img_num 最可信的预测结果对应的图像编号.
+ p1 是算法对推特中图片的一号预测 
+ p1_conf 是算法的一号预测的可信度
+ p1_dog 是一号预测该图片是否属于“狗”（有可能是其他物种，比如熊、马等）,True表示图片属于狗
+ p2 是算法对推特中图片预测的第二种可能性
+ p2_conf 是算法的二号预测的可信度
+ p2_dog 是二号预测该图片是否属于“狗”.
+ p3 图片预测的第三种可能性
+ p3_conf 三号预测的可信度
+ p3_dog 三号预测的图片是否属于狗。

参考：
+ Udacity

In [28]:
df_img.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB


In [29]:
df_img.describe()

Unnamed: 0,tweet_id,img_num,p1_conf,p2_conf,p3_conf
count,2075.0,2075.0,2075.0,2075.0,2075.0
mean,7.384514e+17,1.203855,0.594548,0.1345886,0.06032417
std,6.785203e+16,0.561875,0.271174,0.1006657,0.05090593
min,6.660209e+17,1.0,0.044333,1.0113e-08,1.74017e-10
25%,6.764835e+17,1.0,0.364412,0.05388625,0.0162224
50%,7.119988e+17,1.0,0.58823,0.118181,0.0494438
75%,7.932034e+17,1.0,0.843855,0.1955655,0.09180755
max,8.924206e+17,4.0,1.0,0.488014,0.273419


In [30]:
df_img.isna().any()

tweet_id    False
jpg_url     False
img_num     False
p1          False
p1_conf     False
p1_dog      False
p2          False
p2_conf     False
p2_dog      False
p3          False
p3_conf     False
p3_dog      False
dtype: bool

In [31]:
df_img.duplicated(subset=['tweet_id'],keep="first").sum()

0

In [32]:
# Verify that the three predictions are ranked by confidence: p1 >= p2 >= p3.
# A trailing `| (p1_conf >= p3_conf)` clause would accept any row with
# p1_conf >= p3_conf regardless of p2_conf, because `&` binds tighter than `|`.
ranked_by_conf = (df_img.p1_conf >= df_img.p2_conf) & (df_img.p2_conf >= df_img.p3_conf)
df_img[ranked_by_conf].shape

(2075, 12)

In [33]:
df_img.img_num.value_counts()

1    1780
2     198
3      66
4      31
Name: img_num, dtype: int64

In [34]:
len(df_img.query("img_num >1"))

295

In [35]:
df_img.p1.str.istitle().sum()

402

In [36]:
df_img.p1.str.islower().sum()

1135

In [37]:
df_img[df_img.p1.str.istitle()].head(1)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
15,666099513787052032,https://pbs.twimg.com/media/CT51-JJUEAA6hV8.jpg,1,Lhasa,0.58233,True,Shih-Tzu,0.166192,True,Dandie_Dinmont,0.089688,True


In [38]:
df_img[~((df_img.p1.str.istitle())|(df_img.p1.str.islower()))].head(1)

Unnamed: 0,tweet_id,jpg_url,img_num,p1,p1_conf,p1_dog,p2,p2_conf,p2_dog,p3,p3_conf,p3_dog
0,666020888022790149,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,1,Welsh_springer_spaniel,0.465074,True,collie,0.156665,True,Shetland_sheepdog,0.061428,True


#### 评估df_img小结：
+ p2,p2_conf,p3,p3_conf,p2_dog,p3_dog这些列，不需要。
+ img_num 表示对应的图片的编号，不需要这个列。
+ p1，狗的类型，存在大小写不一致的情况。



### 针对tweet_json.txt文件的评估

In [39]:
df_supl.sample(5)

Unnamed: 0,contributors,coordinates,created_at,display_text_range,entities,extended_entities,favorite_count,favorited,full_text,geo,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,is_quote_status,lang,place,possibly_sensitive,possibly_sensitive_appealable,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,truncated,user
1025,,,2016-06-22 20:18:30,"[0, 97]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 745712555474169857, 'id_str': '745712555474169857', 'indices': [...","{'media': [{'id': 745712555474169857, 'id_str': '745712555474169857', 'indices': [98, 121], 'media_url': 'http://pbs.twimg.com/media/CllNnkWWMAEDI...",7620,False,This is Percy. He fell asleep at the wheel. Irresponsible af. 7/10 absolute menace on the roadway https://t.co/QHbvtvaw8E,,745712589599014916,745712589599014912,,,,,,False,en,,0.0,0.0,,,,2626,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', '..."
2210,,,2015-11-22 21:41:02,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 668544740393136128, 'id_str': '668544740393136128', 'indices': [...","{'media': [{'id': 668544740393136128, 'id_str': '668544740393136128', 'indices': [117, 140], 'media_url': 'http://pbs.twimg.com/media/CUcl5jeWsAA6...",554,False,It is an honor to rate this pup. He is a Snorklhuahua from Amarillo. A true renaissance dog. Also part Rudolph 10/10 https://t.co/ALNyYuGui7,,668544745690562560,668544745690562560,,,,,,False,en,,0.0,0.0,,,,248,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', '..."
1505,,,2016-01-25 03:16:56,"[0, 101]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 691459697358934016, 'id_str': '691459697358934016', 'indices': [...","{'media': [{'id': 691459697358934016, 'id_str': '691459697358934016', 'indices': [78, 101], 'media_url': 'http://pbs.twimg.com/media/CZiO7mWUEAAa4...",4437,False,Say hello to Leo. He's a Fallopian Puffalope. Precious af. 12/10 would cuddle https://t.co/LZEi0DpRsH,,691459709405118465,691459709405118464,,,,,,False,en,,0.0,0.0,,,,1290,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', '..."
1077,,,2016-06-04 00:08:17,"[0, 110]","{'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 738885039132401664, 'id_str': '738885039132401664', 'indices': [...","{'media': [{'id': 738885039132401664, 'id_str': '738885039132401664', 'indices': [111, 134], 'media_url': 'http://pbs.twimg.com/media/CkEMBz9WYAAG...",4114,False,This is Charles. He's a Nova Scotian Towel Pouncer. Deadly af. Nifty tongue slip. 11/10 would pet with caution https://t.co/EfejX3iRGr,,738885046782832640,738885046782832640,,,,,,False,en,,0.0,0.0,,,,1273,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', '..."
291,,,2017-03-03 01:58:22,"[0, 132]","{'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'GoodDogsGame', 'name': 'Good Dogs', 'id': 827593379009675264, 'id_str': '827593...","{'media': [{'id': 837482239088820224, 'id_str': '837482239088820224', 'indices': [133, 156], 'media_url': 'http://pbs.twimg.com/media/C59VqMaWgAAp...",4189,False,This is Waffles. He's a ship captain in real life and in @GoodDogsGame. Must've gotten to the max level (wink) 13/10 would sail with https://t.co/...,,837482249356513284,837482249356513280,,,,,,False,en,,0.0,0.0,,,,494,False,,"<a href=""http://twitter.com/download/iphone"" rel=""nofollow"">Twitter for iPhone</a>",False,"{'id': 4196983835, 'id_str': '4196983835', 'name': 'SpookyWeRateDogs™', 'screen_name': 'dog_rates', 'location': 'MERCH↴ DM DOGS. WE WILL RATE', '..."


In [40]:
df_supl.shape

(2352, 31)

In [41]:
df_supl.columns

Index(['contributors', 'coordinates', 'created_at', 'display_text_range',
       'entities', 'extended_entities', 'favorite_count', 'favorited',
       'full_text', 'geo', 'id', 'id_str', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status',
       'lang', 'place', 'possibly_sensitive', 'possibly_sensitive_appealable',
       'quoted_status', 'quoted_status_id', 'quoted_status_id_str',
       'retweet_count', 'retweeted', 'retweeted_status', 'source', 'truncated',
       'user'],
      dtype='object')

#### 共有31个特征，2352条记录。

+ favorite_count，tweet被点赞的次数，反映阅读者对此tweet内容是否喜欢。
+ retweet_count，tweet被转发的次数，反映阅读者对此tweet内容喜欢的程度。
+ id,tweet的唯一标识


In [42]:
df_supl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2352 entries, 0 to 2351
Data columns (total 31 columns):
contributors                     0 non-null float64
coordinates                      0 non-null float64
created_at                       2352 non-null datetime64[ns]
display_text_range               2352 non-null object
entities                         2352 non-null object
extended_entities                2073 non-null object
favorite_count                   2352 non-null int64
favorited                        2352 non-null bool
full_text                        2352 non-null object
geo                              0 non-null float64
id                               2352 non-null int64
id_str                           2352 non-null int64
in_reply_to_screen_name          78 non-null object
in_reply_to_status_id            78 non-null float64
in_reply_to_status_id_str        78 non-null float64
in_reply_to_user_id              78 non-null float64
in_reply_to_user_id_str          78 n

In [43]:
df_supl[['favorite_count','retweet_count']].describe()

Unnamed: 0,favorite_count,retweet_count
count,2352.0,2352.0
mean,8109.19898,3134.932398
std,11980.795669,5237.846296
min,0.0,0.0
25%,1417.0,618.0
50%,3596.5,1456.5
75%,10118.0,3628.75
max,132318.0,79116.0


#### 评估df_supl小结：
+ 保留favorite_count,retweet_count,id这三个列，其余列不需要。
+ 这个数据集是关于狗的类型，与df_raw都是存储狗这个observation。
+ id的数据类型是int64
+ 考虑是否从entities列中获取id，与现有的id列的数据进行验证



### 评估总结：

#### 质量

#####  **twitter-archive-enhanced数据**
+ 'in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id','retweeted_status_user_id','retweeted_status_timestamp','expanded_urls','source'这些列不需要。
+ 有些狗没有名字，数值是None,'an','the'或者'a'.而不是null
+ 有些狗没有等级分类。数值是None,不是null
+ 'rating_numerator','rating_denominator',最大值分别为1776和170，分别大于均值13和10
+ timestamp的类型是object，tweet_id的类型是int64

##### **image-prediction数据**
+ p1，狗的类型，存在大小写不一致的情况。
+ p2,p2_conf,p3,p3_conf,p2_dog,p3_dog,img_num这些列，不需要。

##### **tweet_json数据**
+ 保留favorite_count,retweet_count,id这三个特征列，其余列不需要。


#### 整洁度
+ df_raw,df_img，df_supl的observation都是dog
+ df_raw中狗的等级分布在四个列中。
+ df_img中使用'p1','p1_dog'两个列表述狗的种类


## 清理

### 清理数据质量问题

#### 备份三个数据集

In [44]:
df_raw_clean = df_raw.copy()
df_img_clean = df_img.copy()
df_supl_clean = df_supl.copy()

#### 清理不需要的特征

##### 定义
+ 从df_raw删除下列不需要的列： 'in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id','retweeted_status_user_id','retweeted_status_timestamp','expanded_urls','source'（'timestamp'列保留，后续转换为Datetime类型）

+ 从df_img中删除p2,p2_conf,p3,p3_conf,p2_dog,p3_dog,img_num这些列

+ 从df_supl删除除了下列三个列'favorite_count','retweet_count','id'之外的其他列。

##### 代码

In [45]:
# Remove reply/retweet metadata, the expanded URLs and the posting client from the
# archive copy; each column is listed exactly once.
df_raw_clean = df_raw_clean.drop(columns=[
    'in_reply_to_status_id', 'in_reply_to_user_id',
    'retweeted_status_id', 'retweeted_status_user_id',
    'retweeted_status_timestamp', 'expanded_urls', 'source',
])
# Keep only the top prediction (p1*) together with the tweet id and image URL.
df_img_clean = df_img_clean.drop(
    columns=['p2', 'p2_conf', 'p3', 'p3_conf', 'p2_dog', 'p3_dog', 'img_num']
)
# .copy() turns the column selection into an independent frame, so later column
# assignments and in-place renames on df_supl_clean do not act on a slice
# (which is what triggers SettingWithCopyWarning).
df_supl_clean = df_supl_clean[['id', 'favorite_count', 'retweet_count']].copy()

##### 测试

In [46]:
df_raw_clean.columns

Index(['tweet_id', 'timestamp', 'text', 'rating_numerator',
       'rating_denominator', 'name', 'doggo', 'floofer', 'pupper', 'puppo'],
      dtype='object')

In [47]:
df_img_clean.columns

Index(['tweet_id', 'jpg_url', 'p1', 'p1_conf', 'p1_dog'], dtype='object')

In [48]:
df_supl_clean.columns

Index(['id', 'favorite_count', 'retweet_count'], dtype='object')

#### 清理空值

##### 定义
+ replace方法对df_raw中的'name'中的'a','an','the'替换为"None"
+ 使用dropna方法清理空值以及相应的记录。

##### 代码

In [49]:
# Assign the replaced column back to the frame. Calling replace(inplace=True) on
# the attribute-accessed Series is a chained in-place update that is not
# guaranteed to write through to df_raw_clean (it does nothing under pandas
# copy-on-write).
df_raw_clean['name'] = df_raw_clean['name'].replace(['a', 'an', 'the'], "None")
# Drop any row that still contains a real null value.
df_raw_clean = df_raw_clean.dropna(axis='index')

##### 测试

In [50]:
# First value: article placeholders ('a', 'an', 'the') still present in 'name'.
# Second value: null cells remaining anywhere in the frame. Both should be 0.
df_raw_clean.name.isin(['a', 'an', 'the']).sum(), df_raw_clean.isnull().sum().sum()

0

#### 清理数据类型

##### 定义
+ 使用astype或者to_datetime方法将df_raw中的'timestamp'的数据类型改为Datetime类型
+ 使用astype方法将df_raw,df_img中的'tweet_id'，以及df_supl中的'id'的数据类型改为str类型

##### 代码

In [51]:
# Parse the timestamp strings into datetimes.
df_raw_clean['timestamp'] = pd.to_datetime(df_raw_clean['timestamp'])
# Tweet ids are identifiers, not quantities: store them as strings in every table.
df_raw_clean['tweet_id'] = df_raw_clean['tweet_id'].astype(str)
df_img_clean['tweet_id'] = df_img_clean['tweet_id'].astype(str)
# assign() returns a new frame, so this is safe even when df_supl_clean is a
# column selection of another frame (no SettingWithCopyWarning).
df_supl_clean = df_supl_clean.assign(id=df_supl_clean['id'].astype(str))

##### 测试

In [52]:
df_raw_clean.dtypes

tweet_id                      object
timestamp             datetime64[ns]
text                          object
rating_numerator               int64
rating_denominator             int64
name                          object
doggo                         object
floofer                       object
pupper                        object
puppo                         object
dtype: object

In [53]:
df_img_clean.dtypes

tweet_id     object
jpg_url      object
p1           object
p1_conf     float64
p1_dog         bool
dtype: object

In [54]:
df_supl_clean.dtypes

id                object
favorite_count     int64
retweet_count      int64
dtype: object

#### 清理一致性问题

##### 定义
+ 使用str.title()将df_img中'p1'中狗的类型的格式进行统一化处理。

##### 代码

In [55]:
# Title-case every predicted label so the same label is always spelled with the same casing.
df_img_clean['p1'] = df_img_clean['p1'].str.title()

##### 测试

In [56]:
# True only if every p1 label is title-cased. Comparing the length of the boolean
# Series with the row count would be True no matter what the labels look like.
df_img_clean.p1.str.istitle().all()

True

#### 清理异常值问题

##### 定义

+ 过滤掉df_raw中'rating_numerator'大于20，或者'rating_denominator'大于10的记录（满足任一条件即过滤）。

##### 代码

In [57]:
# A rating is treated as an outlier when its numerator exceeds 20 or its
# denominator exceeds 10; every other row is kept.
is_outlier = (df_raw_clean.rating_numerator > 20) | (df_raw_clean.rating_denominator > 10)
df_raw_clean = df_raw_clean[~is_outlier]

##### 测试

In [58]:
df_raw_clean.describe()

Unnamed: 0,rating_numerator,rating_denominator
count,2325.0,2325.0
mean,10.702796,9.996559
std,2.188575,0.165912
min,0.0,2.0
25%,10.0,10.0
50%,11.0,10.0
75%,12.0,10.0
max,17.0,10.0


In [59]:
df_raw_clean[df_raw_clean.rating_denominator==0]

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,doggo,floofer,pupper,puppo


### 清理数据整洁度

#### 清理多个列表示一个特征的问题

##### 定义
+ 过滤掉df_img文件中'p1_dog'为False的记录
+ 创建df_temp，是df_raw的切片，包含'doggo','floofer','pupper','puppo'四个列，并添加'stage'列
+ 使用apply方法将df_raw中'doggo','floofer','pupper','puppo'四个列合并为'stage',
+ 使用drop方法，删除df_raw中的'doggo','floofer','pupper','puppo'四个列，并将df_temp中的'stage'合并到df_raw中。

##### 代码

In [60]:
# Keep only rows whose top prediction is a dog. .copy() detaches the result from
# the frame it was filtered from, so later in-place edits of df_img_clean do not
# operate on a slice (SettingWithCopyWarning).
df_img_clean = df_img_clean[df_img_clean.p1_dog].copy()

In [61]:
# Stage columns are named explicitly instead of being derived from column position.
stage_list = ['doggo', 'floofer', 'pupper', 'puppo']
# .copy() gives df_temp its own data, so adding the 'stage' column on the next
# line never writes into a slice of df_raw_clean.
df_temp = df_raw_clean.loc[:, stage_list].copy()
df_temp["stage"] = np.nan
def to_stage(df, stages=('doggo', 'floofer', 'pupper', 'puppo')):
    """Collapse the one-column-per-stage values of a single row into one label.

    Parameters
    ----------
    df : mapping or pandas.Series
        A single row (despite the name), indexable by stage column name. A stage
        counts as present when the cell value equals its own column name.
    stages : sequence of str, optional
        Stage column names to inspect, in output order.

    Returns
    -------
    str
        The stage name when exactly one stage is present; all present stages
        joined with ',' (e.g. 'doggo,pupper') when a row carries several, so no
        stage is silently dropped; the string "None" when no stage is present.
    """
    found = [c for c in stages if df[c] == c]
    return ",".join(found) if found else "None"
# Compute one stage label per row, then swap the four per-stage columns of
# df_raw_clean for the single 'stage' column.
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
df_temp["stage"] = df_temp.apply(to_stage, axis=1)
df_temp = df_temp.drop(columns=stage_columns)
df_raw_clean = pd.concat([df_raw_clean, df_temp], axis=1).drop(columns=stage_columns)

##### 测试

In [62]:
df_img_clean.p1_dog.value_counts()

True    1532
Name: p1_dog, dtype: int64

In [63]:
df_raw_clean.columns

Index(['tweet_id', 'timestamp', 'text', 'rating_numerator',
       'rating_denominator', 'name', 'stage'],
      dtype='object')

In [64]:
df_raw_clean.sample(5)

Unnamed: 0,tweet_id,timestamp,text,rating_numerator,rating_denominator,name,stage
616,796484825502875648,2016-11-09 22:49:15,Here's a sleepy doggo that requested some assistance. 12/10 would carry everywhere https://t.co/bvkkqOjNDV,12,10,,doggo
121,869596645499047938,2017-05-30 16:49:31,This is Scout. He just graduated. Officially a doggo now. Have fun with taxes and losing sight of your ambitions. 12/10 would throw cap for https:...,12,10,Scout,doggo
1798,677228873407442944,2015-12-16 20:48:40,Say hello to Chuq. He just wants to fit in. 11/10 https://t.co/hGkMCjZzn4,11,10,Chuq,
1517,690938899477221376,2016-01-23 16:47:25,"She thought the sunset was pretty, but I thought she was prettier. 10/10 https://t.co/HSL3mnP5NX",10,10,,
1838,675898130735476737,2015-12-13 04:40:46,I'm sure you've all seen this pupper. Not prepared at all for the flying disc of terror. 10/10 https://t.co/G0pQiFGM7O,10,10,,pupper


#### 清理多个数据表描述同一个observation的问题

##### 定义
+ 使用drop方法清理df_img中不必要的列:'p1_dog','p1_conf'
+ 使用drop方法清理df_raw中不必要的列：'text'
+ 使用rename方法，将df_supl中的'id'改为'tweet_id'
+ 使用merge函数，将df_raw，df_supl,df_img,三个数据集，按照'tweet_id'进行左连接。
+ 使用rename方法，将df_img中的'p1'改为'dog_type'

##### 代码

In [65]:
# Reassign instead of mutating in place: df_img_clean and df_supl_clean may be
# selections of other frames, and in-place edits on such selections can emit
# SettingWithCopyWarning.
df_img_clean = df_img_clean.drop(columns=['p1_dog', 'p1_conf'])
df_raw_clean = df_raw_clean.drop(columns=['text'])
# Give the key column the same name in every table so they can be merged on it.
df_supl_clean = df_supl_clean.rename(columns={'id': 'tweet_id'})

In [66]:
# Left-join the engagement counts and then the image predictions onto the
# archive rows, matching on tweet_id.
df_archive_master = (
    df_raw_clean
    .merge(df_supl_clean, on='tweet_id', how='left')
    .merge(df_img_clean, on='tweet_id', how='left')
)

In [67]:
# Rename the top-prediction column 'p1' to 'dog_type'.
df_archive_master = df_archive_master.rename(columns={'p1': 'dog_type'})

##### 测试

In [68]:
df_archive_master.head(5)

Unnamed: 0,tweet_id,timestamp,rating_numerator,rating_denominator,name,stage,favorite_count,retweet_count,jpg_url,dog_type
0,892420643555336193,2017-08-01 16:23:56,13,10,Phineas,,39492.0,8842.0,,
1,892177421306343426,2017-08-01 00:17:27,13,10,Tilly,,33786.0,6480.0,https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg,Chihuahua
2,891815181378084864,2017-07-31 00:18:03,12,10,Archie,,25445.0,4301.0,https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg,Chihuahua
3,891689557279858688,2017-07-30 15:58:51,13,10,Darla,,42863.0,8925.0,,
4,891327558926688256,2017-07-29 16:00:24,12,10,Franklin,,41016.0,9721.0,https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg,Basset


In [69]:
df_archive_master.tail(5)

Unnamed: 0,tweet_id,timestamp,rating_numerator,rating_denominator,name,stage,favorite_count,retweet_count,jpg_url,dog_type
2320,666049248165822465,2015-11-16 00:24:50,5,10,,,111.0,41.0,https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg,Miniature_Pinscher
2321,666044226329800704,2015-11-16 00:04:52,6,10,,,309.0,147.0,https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg,Rhodesian_Ridgeback
2322,666033412701032449,2015-11-15 23:21:54,9,10,,,128.0,47.0,https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg,German_Shepherd
2323,666029285002620928,2015-11-15 23:05:30,7,10,,,132.0,48.0,https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg,Redbone
2324,666020888022790149,2015-11-15 22:32:08,8,10,,,2528.0,530.0,https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg,Welsh_Springer_Spaniel


### 存储清理后的主数据集

In [74]:
# Save the cleaned master dataset as a CSV file.
# index=False keeps the positional RangeIndex out of the file; otherwise it is
# written as an unnamed first column and comes back as 'Unnamed: 0' on reload.
df_archive_master.to_csv('twitter_archive_master.csv', encoding='utf-8', index=False)

## 分析和可视化

> 提示：
- 在 Notebook 使用 pandas 或 SQL 分析主数据集，并生成至少三（3）个独立的结论。
- 在 Notebook 中使用 Python 绘图库或在 Tableau 中至少生成一（1）个可视化图表。


In [71]:
# 分析或可视化代码


与上面分析或可视化相关的见解或说明

In [72]:
# 分析或可视化代码


与上面分析或可视化相关的见解或说明

In [73]:
# 你需要添加更多的 code cell 和 markdown cell 来完成所有分析和可视化


更多说明或总结等

> 提示：在完成 Notebook 的所有内容之后，还需要完成两篇文本和图片组成的 PDF 报告。因为这两篇报告中只是文字和图片，不需要包含代码，你可以使用文字编辑软件，比如 Word 来完成：
- 创建一个 300-600 字的书面报告，命名为 `wrangle_report.pdf`，在该报告中简要描述你的数据整理过程。这份报告可以看作是一份内部文档，供你的团队成员查看交流。
- 创建一个 250 字以上的书面报告，命名为 `act_report.pdf`，在该报告中，你可以与读者交流观点，展示你使用整理过的数据生成的可视化图表。这份报告可以看作是一份外部文档，如博客帖子或杂志文章。