# Additional data for Final Project

From: https://data2.unhcr.org/en/situations/ukraine

Cumulative number of migrants by date (from 02/24/2022 to 03/15/2022)

In [1]:
# Dependencies
import json
import sqlite3

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [2]:
# Load the consolidated tweet sentiment scores (one row per tweet)
tweets_df = pd.read_csv(
    'resources/tweet_sentiments_consolidated.csv',
    encoding="utf-8",
    index_col=False,
)
tweets_df.head()

Unnamed: 0,location,tweetcreatedts,text,country,neg,neu,pos,compound,sentiment
0,India,2022-03-01,urges government to rescue student along wit...,india,0.0,0.769,0.231,0.5106,positive
1,"Stoney Creek, Ontario",2022-03-01,moscow underground sometimes the act of resis...,canada,0.0,0.88,0.12,0.3818,positive
2,"Los Angeles, CA",2022-03-01,via video game industry shares support for u...,usa,0.219,0.506,0.275,0.0,neutral
3,Australia,2022-03-01,tomlinson had planned stops in kyiv on july ...,australia,0.127,0.873,0.0,-0.1531,negative
4,Ukraine,2022-03-01,stop russian aggression against close the ...,ukraine,0.32,0.558,0.122,-0.4404,negative


In [3]:
# Load the daily civilian casualty counts
casualties_df = pd.read_csv(
    'resources/Ukraine_Casualties_daily.csv',
    index_col=False,
)
casualties_df.head()

Unnamed: 0,Date,total civilian casualties
0,2/24/2022,30
1,2/25/2022,25
2,2/26/2022,185
3,2/27/2022,0
4,2/28/2022,166


In [4]:
# Give the casualty column a short, space-free name so it is easy to reference in SQL
casualties_df = casualties_df.rename(
    columns={'total civilian casualties': 'casualties'}
)

In [5]:
# Normalise the dates to ISO 8601 strings (YYYY-MM-DD): parse the column as
# datetimes, then format it back to text in one chained expression.
casualties_df['Date'] = pd.to_datetime(casualties_df['Date']).dt.strftime('%Y-%m-%d')
casualties_df.head(20)

Unnamed: 0,Date,casualties
0,2022-02-24,30
1,2022-02-25,25
2,2022-02-26,185
3,2022-02-27,0
4,2022-02-28,166
5,2022-03-01,144
6,2022-03-02,202
7,2022-03-03,86
8,2022-03-04,168
9,2022-03-05,52


## Data obtained from the UN Refugee Agency
Source: the UNHCR data portal linked at the top of this notebook.

In [6]:
# Cumulative migrant counts, one record per day from 2022-02-24 to 2022-03-15,
# taken from the UN Refugee Agency data referenced at the top of this notebook.
#   data_date      - calendar date as a YYYY-MM-DD string
#   unix_timestamp - the same date expressed in epoch seconds (midnight UTC)
#   individuals    - running (cumulative) total up to that date, not a daily figure
migration_data = [{
"data_date": "2022-02-24",
"unix_timestamp": 1645660800,
"individuals": 84681
},
{
"data_date": "2022-02-25",
"unix_timestamp": 1645747200,
"individuals": 192982
},
{
"data_date": "2022-02-26",
"unix_timestamp": 1645833600,
"individuals": 341301
},
{
"data_date": "2022-02-27",
"unix_timestamp": 1645920000,
"individuals": 509665
},
{
"data_date": "2022-02-28",
"unix_timestamp": 1646006400,
"individuals": 672139
},
{
"data_date": "2022-03-01",
"unix_timestamp": 1646092800,
"individuals": 838829
},
{
"data_date": "2022-03-02",
"unix_timestamp": 1646179200,
"individuals": 1033312
},
{
"data_date": "2022-03-03",
"unix_timestamp": 1646265600,
"individuals": 1201217
},
{
"data_date": "2022-03-04",
"unix_timestamp": 1646352000,
"individuals": 1373316
},
{
"data_date": "2022-03-05",
"unix_timestamp": 1646438400,
"individuals": 1570836
},
{
"data_date": "2022-03-06",
"unix_timestamp": 1646524800,
"individuals": 1779996
},
{
"data_date": "2022-03-07",
"unix_timestamp": 1646611200,
"individuals": 1987656
},
{
"data_date": "2022-03-08",
"unix_timestamp": 1646697600,
"individuals": 2172694
},
{
"data_date": "2022-03-09",
"unix_timestamp": 1646784000,
"individuals": 2343537
},
{
"data_date": "2022-03-10",
"unix_timestamp": 1646870400,
"individuals": 2478204
},
{
"data_date": "2022-03-11",
"unix_timestamp": 1646956800,
"individuals": 2597460
},
{
"data_date": "2022-03-12",
"unix_timestamp": 1647043200,
"individuals": 2713282
},
{
"data_date": "2022-03-13",
"unix_timestamp": 1647129600,
"individuals": 2838599
},
{
"data_date": "2022-03-14",
"unix_timestamp": 1647216000,
"individuals": 2952275
},
{
"data_date": "2022-03-15",
"unix_timestamp": 1647302400,
"individuals": 3000381
}]

In [7]:
# Build the migration frame without the unix_timestamp column
# (only data_date is used in the rest of the notebook)
migration_df = pd.DataFrame(migration_data).drop(columns=['unix_timestamp'])

In [8]:
# Convert the cumulative totals into daily numbers.
migration_df['people'] = migration_df['individuals'].diff()
# diff() leaves the first row as NaN because there is no previous day, so the
# first day's figure is simply its cumulative total. Assign through .loc:
# the chained form `migration_df['people'][0] = ...` writes to an intermediate
# Series, raises SettingWithCopyWarning and is not guaranteed to update the
# DataFrame itself.
migration_df.loc[0, 'people'] = migration_df.loc[0, 'individuals']
migration_df.drop(['individuals'], axis = 1, inplace=True)
migration_df.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,data_date,people
0,2022-02-24,84681.0
1,2022-02-25,108301.0
2,2022-02-26,148319.0
3,2022-02-27,168364.0
4,2022-02-28,162474.0
5,2022-03-01,166690.0
6,2022-03-02,194483.0
7,2022-03-03,167905.0
8,2022-03-04,172099.0
9,2022-03-05,197520.0


In [9]:
# Write each DataFrame to its own table in a local SQLite database file.
# if_exists='replace' overwrites a table left over from a previous run, so the
# cell can be re-executed; index=False keeps the pandas row index out of the tables.
conn = sqlite3.connect('resources/ukraine_analysis.sqlite')
tables = {
    'tweets': tweets_df,
    'migration': migration_df,
    'casualties': casualties_df,
}
for table_name, frame in tables.items():
    frame.to_sql(table_name, conn, if_exists='replace', index=False)


In [10]:
# Create a cursor object on the open connection; the following cells use it
# to execute the raw SQL statements (DROP / CREATE / INSERT)
c = conn.cursor()

In [11]:

# Start from a clean slate so this cell can be re-run. IF EXISTS stops the
# DROP from failing with "no such table" the first time the notebook runs
# against a fresh database file.
sql1 = '''DROP TABLE IF EXISTS migration_sentiments;'''
c.execute(sql1)

# Table holding one row per date: daily number of people, civilian casualties
# and the average compound tweet sentiment for that date
sql2 = '''CREATE TABLE migration_sentiments (data_date DATE NOT NULL, people FLOAT, casualties INT, compound_sentiment FLOAT);'''

# Executing the query
c.execute(sql2)

<sqlite3.Cursor at 0x2a8271e76c0>

In [12]:
# For every migration date, attach that day's casualty count and the average
# compound sentiment of the tweets created on that date. The LEFT JOINs keep
# dates that have no casualty row or no tweets (those columns are then NULL).
sql2 = '''INSERT INTO migration_sentiments SELECT mg.data_date, mg.people, ca.casualties, AVG(tw.compound)
        FROM migration as mg
        LEFT JOIN casualties as ca
            ON mg.data_date = ca.Date
        LEFT JOIN tweets as tw
            ON tw.tweetcreatedts = mg.data_date
        GROUP BY mg.data_date;'''

c.execute(sql2)

# sqlite3 opens a transaction implicitly for INSERT statements. Without an
# explicit commit the inserted rows are rolled back when the connection is
# closed, leaving the table empty in the database file.
conn.commit()

<sqlite3.Cursor at 0x2a8271e76c0>

In [13]:
# Read the combined table back through the same connection to inspect the result
pd.read_sql(sql='select * from migration_sentiments', con=conn)

Unnamed: 0,data_date,people,casualties,compound_sentiment
0,2022-02-24,84681.0,30,-0.097749
1,2022-02-25,108301.0,25,-0.053259
2,2022-02-26,148319.0,185,-0.002411
3,2022-02-27,168364.0,0,-0.022003
4,2022-02-28,162474.0,166,-0.065052
5,2022-03-01,166690.0,144,-0.102634
6,2022-03-02,194483.0,202,-0.081891
7,2022-03-03,167905.0,86,-0.071057
8,2022-03-04,172099.0,168,-0.10366
9,2022-03-05,197520.0,52,-0.034918


In [14]:
# Persist any pending changes, then close out the cursor and the connection.
# sqlite3 does not commit on close: an uncommitted transaction is discarded.
conn.commit()
c.close()
conn.close()