This is a repository created by Lei Huang to record Leetcode SQL practice.
sql
-- 175. Combine Two Tables: every person, address columns NULL when no match.
SELECT P.FirstName, P.LastName, A.City, A.State
FROM Person P
LEFT JOIN Address A ON A.PersonId = P.PersonId
Python
# 175 (pandas): left-join Address onto Person, keep the four output columns.
table = Person.merge(Address, left_on=['PersonId'], right_on=['PersonId'], how='left')
# Column selection needs a list inside the brackets; `table['a', 'b', ...]`
# is a single tuple key and raises KeyError.
table[['FirstName', 'LastName', 'City', 'State']]
sql
#if there is no tie and consider null value
V1: select (select distinct salary from Employee order by salary desc limit 1,1) as SecondHighestSalary
V2:
ifnull(x,y),若x不为空则返回x,否则返回y,这道题y=null
limit x,y,找到对应的记录就停止
distinct,过滤关键字
select
ifnull
(
(select distinct Salary
from Employee
order by Salary desc
limit 1,1),
null
)as 'SecondHighestSalary'
#if there is a tie
V1:
select max(salary) SecondHighestSalary
from employee
where salary < (select max(salary) from employee)
V2:
SELECT max(salary) SecondHighestSalary
FROM (SELECT salary, DENSE_RANK() OVER (ORDER BY salary DESC) rank_num
      FROM Employee
     ) t  -- a derived table must have an alias (MySQL error 1248 without it)
WHERE rank_num = 2
python
employee['Salary'].drop_duplicates().loc[employee['salary_rank'] == 1]
sql
# V1: group by to drop duplicate values, or you can use distinct
CREATE FUNCTION getNthHighestSalary(N INT) RETURNS INT
BEGIN
set N := N-1;
RETURN (
# Write your MySQL query statement below.
select Salary
from Employee
group by Salary
order by Salary desc
limit N,1
);
end
#v2:Find out the n-1 salary by getting n-1 salaries higher than this salary
CREATE FUNCTION getNthHighestSalary(N INT) RETURNS INT
BEGIN
set N := N-1;
RETURN (
# Write your MySQL query statement below.
select distinct e1.Salary
from Employee e1
where (select count(distinct Salary) from Employee e2 where e2.Salary > e1.Salary) = N
);
end
#v3: count how many salary is higher than this salary
CREATE FUNCTION getNthHighestSalary(N INT) RETURNS INT
BEGIN
set N := N-1;
RETURN (
# Write your MySQL query statement below.
select distinct e1.Salary
from Employee e1 left join Employee e2 on e1.Salary < e2.Salary
group by e1.Salary
having count(distinct e2.Salary) = N
);
end
#v4: window function
#row_number(): 同薪不同名,相当于行号,例如3000、2000、2000、1000排名后为1、2、3、4
#rank(): 同薪同名,有跳级,例如3000、2000、2000、1000排名后为1、2、2、4
#dense_rank(): 同薪同名,无跳级,例如3000、2000、2000、1000排名后为1、2、2、3
CREATE FUNCTION getNthHighestSalary(N INT) RETURNS INT
BEGIN
RETURN (
# Write your MySQL query statement below.
select distinct Salary
from (select Salary, dense_rank() over (order by Salary Desc) as RN
from Employee) E
where RN = N
);
end
Python
def getNthHighestSalary(N):
    """Return the distinct salary value(s) with dense rank N, descending.

    NOTE(review): reads the module-level `employee` DataFrame, as the
    original did — confirm that is intentional.
    """
    # Keep the rank in a local Series instead of mutating the global frame.
    ranks = employee['Salary'].rank(method='dense', ascending=False)
    # Filter before drop_duplicates so the mask index aligns with the Series.
    Salary = employee.loc[ranks == N, 'Salary'].drop_duplicates()
    return Salary
sql
-- RANK is a reserved word since MySQL 8.0.2; the alias must be quoted.
select Score, RN as `Rank`
from (select Score,
             dense_rank() over (order by Score desc) as RN
      from Scores) s
order by RN
#v2
select a.Score as Score,
       (select count(distinct b.Score) from Scores b where b.Score >= a.Score) as `Rank`
from Scores a
order by a.Score DESC
python
# The keyword name `method` was dropped in the original, leaving the
# invalid call `rank(= 'dense', ...)`.
Scores['rank'] = Scores['Score'].rank(method='dense', ascending=0)
Scores[['rank', 'Score']].sort_values(by=['rank'])
sql
select distinct l1.Num as ConsecutiveNums
from Logs l1, Logs l2, Logs l3
where l1.Id = l2.Id - 1 and l2.Id = l3.Id - 1 and l1.Num = l2.Num and l2.Num = l3.Num
#window function
select distinct(num) ConsecutiveNums
from (
select num,(row_number() over(order by id )-row_number() over(partition by num order by id)) as rn
from Logs
) tmp
group by rn,num
having count(rn)>=3;
#lead and lag solutions
with x as
(select id,
lead(Num,1,0) over (order by id asc) Lead,
lag(Num,1,0) over (order by id asc) Lag
from Logs
)
select
distinct Num as "ConsecutiveNums"
from
Logs L
join x y on L.id = y.id and y.Lead = L.Num and y.Lag = L.Num;
python
#translating window function
Logs['rn'] = Logs['Id'].rank(method = 'first')
Logs['rn1'] = Logs.groupby(['Num'])['Id'].rank(method = 'first')
Logs['dif'] = Logs['rn']-Logs['rn1']
Logs1 = Logs.groupby(['dif', 'Num'])['dif'].count().to_frame('count').reset_index()
Logs1[Logs1['count'] >= 3]['Num'].drop_duplicates()
#transating the three tables join
#???
sql
select e2.Name as Employee
from Employee e1 join Employee e2 on e1.Id = e2.ManagerId and e1.Salary < e2.Salary
python
sql
select Email
from Person
group by Email
having count(*) > 1
python
New = Person.groupby(['Email'])['Id'].count().to_frame('count').reset_index()
New[New['count'] > 1]['Email']
sql
select c.Name as Customers
from Customers c
left join Orders o on o.CustomerId = c.Id
where o.Id is null
python
# Customers who never order: mirror the SQL above by joining Customers.Id to
# Orders.CustomerId (id-to-id never yields the intended NULLs), then keep the
# unmatched rows — `is null` is not Python; use .isnull().
new_df = pd.merge(Customers, Orders, how='left', left_on=['Id'], right_on=['CustomerId'], suffixes=('_t1', '_t2'))
new_df[new_df['Id_t2'].isnull()]['Name']
sql
select D.Name as Department, E.Name as Employee, E.Salary
from (select *,
dense_rank() over (partition by DepartmentId order by Salary desc) as rn
from Employee) E join
Department D on E.DepartmentId = D.Id
where rn = 1
order by E.Salary
#v2:
select d.Name Department,e.Name Employee,Salary
from Employee e
join Department d
on e.DepartmentId=d.Id
where(e.DepartmentId , Salary) IN(
select DepartmentId, max(salary)
from Employee
group by DepartmentId
);
python
# Keep the rank as a NEW column: the original replaced the whole Employee
# frame with the rank Series, then filtered on a non-existent 'rn' column
# using `=` (assignment) instead of `==`.
Employee['rn'] = Employee.groupby(['DepartmentId'])['Salary'].rank(method='dense', ascending=0)
dat = pd.merge(Employee, Department, how='inner', left_on='DepartmentId', right_on='Id', suffixes=('_t1', '_t2'))
result = dat[dat['rn'] == 1][['Name_t2', 'Name_t1', 'Salary']]
result.columns = ['Department', 'Employee', 'Salary']
sql
select D.Name as Department, E.Name as Employee, E.Salary
from (select *,
dense_rank() over (partition by DepartmentId order by Salary desc) as rn
from Employee) E join
Department D on E.DepartmentId = D.Id
where rn <= 3
order by E.Salary
python
# Same fix as the top-1 variant: store the dense rank in a column rather than
# overwriting Employee, so `dat['rn']` exists for the <= 3 filter.
Employee['rn'] = Employee.groupby(['DepartmentId'])['Salary'].rank(method='dense', ascending=0)
dat = pd.merge(Employee, Department, how='inner', left_on='DepartmentId', right_on='Id', suffixes=('_t1', '_t2'))
result = dat[dat['rn'] <= 3][['Name_t2', 'Name_t1', 'Salary']]
result.columns = ['Department', 'Employee', 'Salary']
sql
DELETE p1 FROM Person p1,
Person p2
WHERE
p1.Email = p2.Email AND p1.Id > p2.Id
#v2: not in subquery
DELETE from Person
Where Id not in (
Select Id
From(
Select MIN(Id) as id
From Person
Group by Email
) t
)
python
P1 = Person.groupby(['Email'])['Id'].min().to_frame('id').reset_index()
Person[~Person.Id.isin(P1['id'])]
#should be Person.Id since isin need series type
#.isin(P1['id']) should be P1['id'] since inside isin need list type
sql
select w1.Id
from Weather w1, Weather w2
where dateDiff(w1.RecordDate,w2.RecordDate) = 1
and w1.Temperature > w2.Temperature
python
#datediff in python
from datetime import datetime
def daysdiff(d1, d2):
d1 = datetime.strptime(d1, "%Y-%m-%d")
d2 = datetime.strptime(d2, "%Y-%m-%d")
return (d1 - d2).days
sql
SELECT
request_at 'Day', round(avg(Status!='completed'), 2) 'Cancellation Rate'
FROM
trips t JOIN users u1 ON (t.client_id = u1.users_id AND u1.banned = 'No')
JOIN users u2 ON (t.driver_id = u2.users_id AND u2.banned = 'No')
WHERE
request_at BETWEEN '2013-10-01' AND '2013-10-03'
GROUP BY
request_at
Python
# 262 (pandas). isin is a method call, not an attribute chain.
banned = User[User['Banned'] == 'Yes']['Users_Id']
Trips = Trips[~Trips.Client_Id.isin(banned)]
Trips = Trips[~Trips.Driver_Id.isin(banned)]
Trips = Trips[(Trips.Request_at >= '2013-10-01') & (Trips.Request_at <= '2013-10-03')]
# case when: flag cancelled trips as 1 (avoids chained assignment on Status)
Trips['cancelled'] = (Trips.Status != 'Completed').astype(int)
# `sum`/`count` must be named aggregations; the original passed the undefined
# bare name `count` and then used the DataFrame methods .sum/.count as values.
dat = Trips.groupby('Request_at')['cancelled'].agg(['sum', 'count']).reset_index()
dat['Cancellation Rate'] = round(dat['sum'] / dat['count'], 2)
sql
select player_id, event_date as first_login
from (select *,
row_number() over (partition by player_id order by event_date) as rn
from Activity) S
where rn = 1
python
# 'first' must be quoted; store the rank as a column — the original bound the
# bare rank Series to `dat`, which therefore had no 'rn' to filter on.
Activity['rn'] = Activity.groupby(['player_id'])['event_date'].rank(method='first')
Activity[Activity['rn'] == 1][['player_id', 'event_date']]
sql
select player_id, device_id
from activity
where (player_id, event_date) in
(select player_id, min(event_date)
from activity
group by player_id)
sql
select A1.player_id, A1.event_date, sum(A2.games_played) as games_played_so_far
from Activity A1 left join Activity A2 on A1.player_id = A2.player_id and A1.event_date >= A2.event_date
group by A1.player_id, A1.event_date
order by A1.player_id, A1.event_date
python Perform an asof merge. This is similar to a left-join except that we match on nearest key rather than equal keys.
sql
--注意这道题求的是首日注册后第二天连续登录的.不是任意两天连续登录就行.
#V1:
select round((select count(distinct(A1.player_id))
from (select player_id, min(event_date) as event_date from Activity group by player_id) A1 join Activity A2 on A1.player_id = A2.player_id and datediff(A2.event_date, A1.event_date) = 1)/count(distinct player_id),2) as fraction
from Activity
#V2:得出所有玩家次日登录时间,看原数据中是否存在,统计存在的个数,除以总人数。
SELECT
ROUND(COUNT(DISTINCT player_id)/(SELECT COUNT(distinct player_id) FROM Activity),
2) AS fraction
FROM
Activity
WHERE
(player_id,event_date)
IN
(SELECT
player_id,
Date(min(event_date)+1)
FROM Activity
GROUP BY player_id);
python
A1 = Activity.groupby(['player_id'])['event_date'].min().to_frame('Date').reset_index()
# Parse with pd.to_datetime (the original's '%Y%m%d' would raise on dashed
# dates) and shift one day to get each player's expected second-login date.
A1['Date1'] = pd.to_datetime(A1['Date']) + timedelta(days=1)
index1 = pd.MultiIndex.from_arrays([A1[col] for col in ['player_id', 'Date1']])
# NOTE(review): assumes Activity['event_date'] is already datetime-typed so
# the two indexes compare equal — confirm, or pd.to_datetime it as well.
index2 = pd.MultiIndex.from_arrays([Activity[col] for col in ['player_id', 'event_date']])
New = Activity.loc[index2.isin(index1)]
fraction = round(New.player_id.nunique() / Activity.player_id.nunique(), 2)
#
# 595. Big Countries
sql
```sql
select name, population, area
from World
where area > 3000000 or population > 25000000
order by name
python
# Element-wise OR on Series needs `|` (Python `or` raises ValueError); the
# source frame is `World` — lowercase `world` is not defined until this line.
world = World[(World['area'] > 3000000) | (World['population'] > 25000000)]
world.sort_values(by=['name'])  # curly quotes replaced with straight quotes
sql
SELECT Id, Company, Salary FROM
(SELECT Id, Company, Salary, COUNT(Salary) OVER (PARTITION BY Company) AS CN,
ROW_NUMBER() OVER (PARTITION BY Company ORDER BY Salary) AS RN FROM Employee) T
-- FLOOR keeps the bounds integral: MySQL `/` is real division, so for an even
-- CN (e.g. 4) the original (CN+1)/2 = 2.5 never matched the lower median row.
WHERE RN IN (FLOOR((CN + 1) / 2), FLOOR((CN + 2) / 2))
sql
select Name
from Employee
where Id in (select ManagerId
             from Employee
             group by ManagerId
             having count(distinct Id) >= 5)  -- closing ')' was missing
#v2:self join (explicit ANSI join instead of a comma join)
select e2.Name
from employee e1
join employee e2 on e1.ManagerId = e2.Id
group by e2.Id
having count(*) >= 5
python
dat = Employee.groupby(['ManagerId'])['Id'].nunique().to_frame('count')
dat1 = Employee[Employee.Id.isin(dat[dat['count'] >= 5].index)].Name
sql
#建两个累计频率,用员工薪水中位数的思路来做
select
avg(t.number) as median
from
(
select
n1.number,
n1.frequency,
(select sum(frequency) from numbers n2 where n2.number<=n1.number) as asc_frequency,
(select sum(frequency) from numbers n3 where n3.number>=n1.number) as desc_frequency
from numbers n1
) t
where t.asc_frequency>= (select sum(frequency) from numbers)/2
and t.desc_frequency>= (select sum(frequency) from numbers)/2
sql
select Name
from Candidate
where id in (
select CandidateId
from Vote
group by CandidateId
having count(distinct id) >= all(select count(distinct id)
from Vote
group by CandidateId))
python
dat = Vote.groupby(['CandidateId'])['id'].nunique().to_frame('count').reset_index()  # reset_index must be CALLED
max_count = dat['count'].max()  # don't shadow the builtin `max`
# isin takes parentheses, not square brackets
Candidate[Candidate.id.isin(dat[dat['count'] >= max_count].CandidateId)].Name
sql
select question_id as survey_log
from
(select question_id, sum(case when action = 'answer' then 1 else 0 end)/sum(case when action = 'show' then 1 else 0 end) as rate
from survey_log
group by question_id) s
order by rate desc
limit 1
python
dat = survey_log.groupby(['question_id'])['action'].value_counts().to_frame('count').reset_index()
...???
sql
#v1: window function
select Id, Month, total_salary as salary
from (select *,
sum(Salary) over (partition by Id order by Month rows 2 preceding) as total_salary,
row_number() over (partition by Id order by Month desc) as RN
from Employee) E
where RN > 1
order by Id, Month desc
#v2: self join and exclude max value
select e1.Id, e1.Month, sum(e2.Salary) as Salary
from Employee e1 join Employee e2
on e1.Id = e2.Id
and e1.Month >= e2.Month
and e1.Month < e2.Month + 3
where (e1.Id,e1.Month) not in (select Id, max(Month) as Month from Employee group by Id)
group by e1.Id, e1.Month
order by e1.Id, e1.Month desc
python
???
sql
select d.dept_name, ifnull(count(distinct s.student_id),0) as student_number
from student s right join department d on d.dept_id = s.dept_id
group by d.dept_id
order by student_number desc
python
dat = student.merge(department, how='right', left_on='dept_id', right_on='dept_id')
dat1 = dat.groupby(['dept_name'])['student_id'].nunique().to_frame('student_number').reset_index()
# straight quotes and the correctly spelled `ascending` keyword
dat1 = dat1.sort_values(by=['student_number'], ascending=0)
sql
select sum(TIV_2016) as TIV_2016
from insurance
where concat(lat,lon) in (select concat(lat,lon)
from insurance
group by concat(lat,lon)
having count(concat(lat,lon)) <= 1)
and
tiv_2015 in (select tiv_2015
from insurance
group by tiv_2015
having count(tiv_2015) > 1)
python
insurance['concat'] = insurance['lat'].astype(str) + insurance['lon'].astype(str)
# Count one column so .to_frame applies to a Series (frame-wide .count()
# returns a DataFrame, which has no .to_frame); reset_index must be called.
dat1 = insurance.groupby(['concat'])['concat'].count().to_frame('count').reset_index()
dat1 = dat1[dat1['count'] <= 1]
dat2 = insurance.groupby(['tiv_2015'])['tiv_2015'].count().to_frame('count').reset_index()
# filter dat2 on its own count column (the original re-filtered dat1)
dat2 = dat2[dat2['count'] > 1]
dat3 = insurance[(insurance['concat'].isin(dat1['concat'])) & (insurance.tiv_2015.isin(dat2['tiv_2015']))]
dat3.TIV_2016.sum()
sql
select round(ifnull(count(distinct b.requester_id, b.accepter_id)/count(distinct a.sender_id, a.send_to_id),0), 2) accept_rate
from friend_request a, request_accepted b
select round(
ifnull(
(select count(distinct requester_id ,accepter_id) from request_accepted) /
(select count(distinct sender_id ,send_to_id) from friend_request)
,0)
,2) as accept_rate ;
sql
select class
from courses
group by class
having count(distinct(student)) >= 5
python
dat = courses.groupby(['class'])['student'].nunique().to_frame('count').reset_index()  # call reset_index
dat[dat['count'] >= 5]['class']
sql
select
id, visit_date, people
from
(select
id, visit_date, people,
count(*) over (partition by offset) cnt
from
(select
id, visit_date, people,
(row_number() over (order by id) - id) offset
from stadium
where people >= 100
) R--get consecutive id
) R1
where cnt >= 3
order by id
python
dat = stadium[stadium['people'] >= 100].copy()  # copy avoids SettingWithCopy on the new columns
dat['rn'] = dat['id'].rank(method='first')  # rank the id column, not the whole frame
dat['offset'] = dat['rn'] - dat['id']
# count a single column (frame-wide count has no .to_frame); call reset_index
dat1 = dat.groupby(['offset'])['id'].count().to_frame('count').reset_index()
data = pd.merge(dat, dat1, how='left', on='offset')
data[data['count'] >= 3].sort_values(by=['id'])  # straight quotes, not curly
sql
select accepter_id as id,sum(num) as num
from
((select accepter_id, count(*) as num
from request_accepted
group by accepter_id)
union all
(select requester_id, count(*) as num
from request_accepted
group by requester_id)) a
group by accepter_id
order by sum(num) desc
limit 1
python
dat1 = request_accepted.groupby(['accepter_id'])['accepter_id'].count().to_frame('num').reset_index()
dat2 = request_accepted.groupby(['requester_id'])['requester_id'].count().to_frame('num').reset_index()
dat = pd.concat([dat1, dat2.rename(columns={'requester_id': 'accepter_id'})])
dat = dat.groupby(['accepter_id'])['num'].sum().to_frame('num').reset_index()
dat = dat.sort_values(by=['num'], ascending=0)
dat.head(1)  # head is a DataFrame method; bare head(dat, 1) is undefined
sql
select id, (case when p_id is null then 'Root'
when id in (select p_id from tree) then 'Inner'
else 'Leaf' end) as Type
from tree
python
tree.loc[tree['p_id'].isnull(),'type'] = 'Root'
tree.loc[(tree.id.isin(tree['p_id'])) & (tree['p_id'].notnull()),'type'] = 'Inner'
tree.loc[tree['type'].isnull(),'type'] = 'Leaf'
sql
select round(min(sqrt(POWER((p1.x-p2.x),2) + POWER((p1.y-p2.y),2))),2) as shortest
from point_2d p1 join point_2d p2 on (p1.x,p1.y) <> (p2.x, p2.y)
sql
select c.pay_month, c.department_id,
       case
           when c.salary_d > d.salary_c then 'higher'
           when c.salary_d < d.salary_c then 'lower'
           else 'same'
       end 'comparison'
from
(
    -- per-department average amount per month
    select date_format(a.pay_date, '%Y-%m') pay_month, b.department_id,
           avg(a.amount) salary_d
    from salary a
    join employee b on a.employee_id = b.employee_id
    group by date_format(a.pay_date, '%Y-%m'), b.department_id
) c
inner join
(
    -- company-wide average per month; the original also selected
    -- b.department_id here while grouping only by month, which is rejected
    -- under ONLY_FULL_GROUP_BY and nondeterministic otherwise
    select date_format(pay_date, '%Y-%m') pay_month,
           avg(amount) salary_c
    from salary
    group by date_format(pay_date, '%Y-%m')
) d
on c.pay_month = d.pay_month;
python
sql
select max(case when continent = 'America' then name else null end) as America,
max(case when continent = 'Asia' then name else null end) as Asia,
max(case when continent = 'Europe' then name else null end) as Europe
from
(select
name,
continent,
row_number()over(partition by continent order by name) cur_rank
from
student)t
group by cur_rank
python need to correct
# Row number of each name within its continent. Rank the column directly:
# the original's .str().rank(...) called the .str accessor as a function,
# which is exactly the "'NoneType' object is not callable" error noted.
student['rank'] = student.groupby(['continent'])['name'].rank(method='first')
Output = student.pivot(index='rank', columns='continent', values='name')
sql
select max(num) as num
from
(select num
from my_numbers
group by num
having count(num) = 1) a
sql
select *
from cinema
where id % 2 <> 0 and description <> 'boring'
order by rating desc
python
cinema[(cinema['id'] % 2 != 0) & (cinema['description'] != 'boring')].sort_value(by = ['rating'], ascending = 0)
sql
select
if(id%2=0,
id-1,
if(id=(select count(distinct id) from seat),
id,
id+1))
as id,student
from seat
order by id
这道题目实际上很简单 查询id和student
若id是偶数,减1 若id是奇数,加1 问题在于当总数为奇数时,最后一个id应保持不变,加1会导致空出一位。那么我们找到最后一位,让它保持不变就可以了。
作者:fan-lu-5 链接:https://leetcode-cn.com/problems/exchange-seats/solution/jian-dan-yi-dong-xiao-lu-ji-bai-suo-you-by-fan-lu-/ 来源:力扣(LeetCode) 著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
SELECT
(CASE
WHEN MOD(id, 2) != 0 AND counts != id THEN id + 1 -- mod gets the remainder
WHEN MOD(id, 2) != 0 AND counts = id THEN id
ELSE id - 1
END) AS id,
student
FROM
seat,
(SELECT
COUNT(*) AS counts
FROM
seat) AS seat_counts
ORDER BY id ASC;
sql
UPDATE salary
SET sex = if(sex='m','f','m')
python
# Swap 'm'/'f' in one vectorized step. The original's two sequential chained
# assignments first turned every 'm' into 'f', then turned ALL 'f' rows
# (original and just-converted alike) into 'm' — the whole column ends up 'm'.
salary['sex'] = salary['sex'].map({'m': 'f', 'f': 'm'})
sql
select t1.customer_id
from (select customer_id, count(distinct product_key) cnt from customer group by customer_id) t1, (select count(*) cnt from product) t2
where t1.cnt = t2.cnt
select c.customer_id
from Product p left join Customer c on p.product_key = c.product_key
where p.product_key = 5
and c.customer_id in (select c.customer_id
from Product p left join Customer c on p.product_key = c.product_key where p.product_key = 6)
order by customer_id
sql
select b.book_id, b.name
from Books b left join Orders o on b.book_id = o.book_id
where b.book_id in (select book_id
from Books
where datediff(month, available_from, '2019-06-23') > 1)
group by b.book_id, b.name
having isnull(sum(case when o.dispatch_date < '2018-06-23' then 0 else quantity end),0) < 10
order by b.book_id
sql
#mysql
select login_date,count(user_id) user_count
from (select user_id, min(activity_date) login_date from Traffic
where activity='login'
group by user_id) t
where datediff('2019-06-30',login_date)<=90
group by login_date;
#sql server
SELECT
login_date,
COUNT(DISTINCT user_id) user_count
FROM
(SELECT
user_id,
MIN(activity_date) OVER(PARTITION BY user_id) login_date
FROM Traffic
WHERE activity = 'login') t
WHERE datediff(day,login_date,'2019-06-30') <= 90
GROUP BY login_date
ORDER BY login_date
select activity_date as login_date, count(distinct user_id) as user_count
from (select *,
row_number() over (partition by user_id order by activity_date) as rn
from (select * from Traffic where activity = 'login')a) A
where rn = 1 and datediff(day,activity_date,'2019-06-30') <= 90
group by activity_date
we can't use row_number to select first user activity here since there are some users' first activity are not log in. wrong answer:
select activity_date as login_date, count(distinct user_id) as user_count
from (select *,
row_number() over (partition by user_id order by activity_date) as rn
from Traffic) A
where rn = 1 and datediff(day,activity_date,'2019-06-30') <= 90
group by activity_date
sql
#window function
select student_id, course_id, grade
from (select *,
rank() over (partition by student_id order by grade desc, course_id) as rn
from Enrollments) S
where rn = 1
order by student_id
#sql
SELECT student_id, MIN(course_id) AS course_id, grade
FROM Enrollments
WHERE (student_id, grade) IN (SELECT student_id, MAX(grade)
FROM Enrollments
GROUP BY student_id)
GROUP BY student_id
ORDER BY student_id
python
Enrollments['rn'] = Enrollments.groupby(['student_id'])['grade'].rank(method = 'dense', ascending = 0)
indice = Enrollments[Enrollments['rn'] == 1].groupby(['student_id'])['course_id'].idxmin # cannot work
#after grouping to minimum value in pandas, how to display the matching row result entirely along min() value
https://datascience.stackexchange.com/questions/26308/after-grouping-to-minimum-value-in-pandas-how-to-display-the-matching-row-resul
need to figure out why
sql
#v1
select business_id
from(select e1.business_id, e1.event_type, (case when e1.occurences > e2.avg then 1 else 0 end) as count
from Events e1 left join (select event_type, avg(occurences) as avg from Events group by event_type) e2
on e1.event_type = e2.event_type) e
group by business_id
having sum(count) > 1
#v2
select business_id
from events e,
(select event_type,avg(occurences) avg_occ
from events
group by event_type) temp
where e.event_type = temp.event_type and e.occurences > temp.avg_occ
group by e.business_id
having count(*) > 1
#v3
select business_id
from Events e left join (
select event_type, avg(occurences) as tavg
from Events
group by event_type
)t on e.event_type = t.event_type
group by business_id
having sum(case when e.occurences > t.tavg then 1 else 0 end) >1
python
sql
select t2.spend_date, t2.platform,
ifnull(sum(amount),0) total_amount, ifnull(count(user_id),0) total_users
from
(select distinct spend_date, "desktop" as platform from Spending
union
select distinct spend_date, "mobile" as platform from Spending
union
select distinct spend_date, "both" as platform from Spending
) t2
left join
(select spend_date, sum(amount) amount, user_id,
case when count(*) = 1 then platform else "both" end as platform
from Spending
group by spend_date, user_id) t1
on t1.spend_date = t2.spend_date
and t1.platform = t2. platform
group by t2.spend_date, t2.platform
python
# 1127-style sketch; curly quotes, `group_by`, unbalanced brackets, and the
# un-parenthesized `&` comparisons of the original all fixed.
spending['mobile_spend'] = spending[spending.channel == 'mobile'].spend
spending['desktop_spend'] = spending[spending.channel == 'desktop'].spend
member_spend = spending.groupby(['date', 'member_id'])[['mobile_spend', 'desktop_spend']].sum().reset_index()
# & binds tighter than comparisons, so each comparison needs its own parens
member_spend.loc[(member_spend.mobile_spend > 0) & (member_spend.desktop_spend == 0), 'channel'] = 'mobile'
member_spend.loc[(member_spend.mobile_spend == 0) & (member_spend.desktop_spend > 0), 'channel'] = 'desktop'
member_spend.loc[(member_spend.mobile_spend > 0) & (member_spend.desktop_spend > 0), 'channel'] = 'both'
tot_members = member_spend.groupby(['date', 'channel']).size().to_frame('tot_members').reset_index()
tot_spend = member_spend.groupby(['date', 'channel']).agg({'mobile_spend': 'sum', 'desktop_spend': 'sum'}).reset_index()
tot_spend['tot_spend'] = tot_spend['mobile_spend'] + tot_spend['desktop_spend']
output = pd.concat([tot_members, tot_spend['tot_spend']], axis=1)
sql
# count will not count null values
select round(100*avg(percent),2) as average_daily_percent
from (select a.action_date, count(distinct(r.post_id))/count(distinct(a.post_id)) as percent
from Removals r right join Actions a on r.post_id = a.post_id
where extra = 'spam'
group by a.action_date) P
#the version blow is wrong since there are duplicate post_id in one day. If I count with case when, I cannot count distinct post_id.
select round(100*avg(percent),2) as average_daily_percent
from (select a.action_date, sum(case when r.post_id is null then 0 else 1 end)/count(distinct a.post_id) as percent
from Removals r right join Actions a on r.post_id = a.post_id
where extra = 'spam'
group by a.action_date) P
python
#count distinct with group by: two solutions
df.groupby("date").agg({"duration": np.sum, "user_id": pd.Series.nunique})
df.groupby("date").agg({"duration": np.sum, "user_id": lambda x: x.nunique()})
sql
# 一定注意日期差是29天 <30.........包含2019-07-27本天
select activity_date day, count(distinct user_id) active_users
from activity
where datediff('2019-07-27', activity_date) < 30
group by activity_date
select activity_date day, count(distinct user_id) active_users
from activity
where activity_date > date_sub('2019-07-27', interval 30 day)
and activity_date <= '2019-07-27'
group by activity_date
python
def daysdiff(d1, d2):
    """Days from d2 to d1 (negative if d1 is earlier); both 'YYYY-MM-DD' strings."""
    fmt = "%Y-%m-%d"
    return (datetime.strptime(d1, fmt) - datetime.strptime(d2, fmt)).days
import datetime as dt
# `]]` typo on the assignment target fixed; the datetime MODULE has no
# strptime attribute, so use dt.datetime.strptime (or pd.Timestamp).
activity['activity_date'] = pd.to_datetime(activity['activity_date'])
activity['day'] = (dt.datetime.strptime('2019-07-27', "%Y-%m-%d") - activity['activity_date']).dt.days
dat = activity[activity['day'] < 30]
dat.groupby(['activity_date'])['user_id'].nunique().to_frame('active_users').reset_index()
sql
# 为空的时候一定要显示一个0,注意一下。
select ifnull(round(count(distinct session_id)/count(distinct user_id),2), 0) average_sessions_per_user
from activity
where datediff('2019-07-27', activity_date) < 30
#v2
select ifnull(round(sum(t.amount)/count(user_id),2), 0) average_sessions_per_user
from
(select user_id, count(distinct session_id) amount
from activity
where datediff('2019-07-27', activity_date) < 30
group by user_id) t
oracle sql
#v1: out of time
select distinct viewer_id as id
from Views
group by viewer_id, view_date
having count(distinct article_id) > 1
order by viewer_id
python
# nunique over a single column: frame-wide .nunique() returns a DataFrame,
# which has no .to_frame; Series.sort_values also takes no `by` argument.
dat = Views.groupby(['viewer_id', 'view_date'])['article_id'].nunique().to_frame('count').reset_index()
dat[dat['count'] > 1]['viewer_id'].drop_duplicates().sort_values()
sql
select u.user_id as buyer_id, u.join_date, count(distinct order_id) as orders_in_2019
from Users u left join Orders o on u.user_id = o.buyer_id and year(o.order_date) =2019
group by u.user_id
#v2
select user_id buyer_id, join_date,
sum(if(year(order_date)='2019',1,0)) orders_in_2019
from users left join orders
on user_id = buyer_id
group by user_id
df = pd.merge(Users,Orders,left_on = ['user_id'], right_on = ['buyer_id'], how = 'left')
df['date'] = pd.to_datetime(df['order_date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
dat = df[df['year'] == 2019 ]
dat.groupby(['user_id','join_date'])['order_id'].nunique().to_frame('orders_in_2019').reset_index()
error message I met for this query:
select o.seller_id, (case when u.favorite_brand = i.item_brand then 'yes' else 'no' end) as '2nd_item_fav_brand'
from
(select u.user_id, o.seller_id, u.favorite_brand, i.item_brand,
row_number() over (partition by seller_id order by order_date) as rn
from Users u left join Orders o on u.user_id = o.seller_id join Items i
on o.item_id = i.item_id) s
where rn = 2
error message:[SQL Server]The multi-part identifier "o.seller_id" could not be bound. (4104) reasons: https://stackoverflow.com/questions/7314134/the-multi-part-identifier-could-not-be-bound
sql
SELECT
u.user_id seller_id,
(case when u.favorite_brand = i.item_brand
then 'yes'
else 'no'
end) '2nd_item_fav_brand'
FROM
Users u left join
(SELECT seller_id,item_id,row_number() over (partition by seller_id order by order_date)
as ranking FROM Orders) t
ON
u.user_id = t.seller_id and ranking =2
LEFT JOIN Items i
ON
t.item_id = i.item_id
#烦死了这道题,又简单又出错,气死,晚点再看
python
Orders['rn'] = Orders.groupby(['seller_id'])['order_date'].rank(method='first')
# pd.merge is a function call — parentheses, not square brackets
dat = pd.merge(Users, Orders, left_on='user_id', right_on='seller_id', how='left')
dat = dat[dat['rn'] == 2]
dat1 = pd.merge(dat, Items, left_on='item_id', right_on='item_id', how='left')
# case when
dat1.loc[(dat1.favorite_brand == dat1.item_brand), '2nd_item_fav_brand'] = 'Yes'
dat1.loc[(dat1.favorite_brand != dat1.item_brand), '2nd_item_fav_brand'] = 'No'
dat1[['seller_id', '2nd_item_fav_brand']]
sql
(select product_id, new_price as price
from (select *,
row_number() over(partition by product_id order by datediff(day,change_date,'2019-08-16')) as rn
from Products
where datediff(day,change_date,'2019-08-16')>= 0) s
where rn = 1)
union
(select product_id as product_id,10 as price
from Products
group by product_id
having min(change_date)>'2019-08-16')
#v2:
(select product_id as product_id,10 as price
from Products
group by product_id
having min(change_date)>'2019-08-16')
union
(select product_id,new_price as price
from Products
where (product_id,change_date) in
(select product_id,max(change_date) as max_date
from Products
where change_date<='2019-08-16'
group by product_id))
python
# 1164 (pandas). Dates are dashed ISO strings, so parse with pd.to_datetime —
# the original's '%Y%m%d' format would raise on '2019-08-16'-style input.
Products['change_date'] = pd.to_datetime(Products['change_date'])
Products['day'] = (pd.Timestamp('2019-08-16') - Products['change_date']).dt.days
Products1 = Products[Products['day'] >= 0].copy()
# smallest non-negative gap = most recent change on/before the cutoff
Products1['rn'] = Products1.groupby(['product_id'])['day'].rank(method='first')
dat1 = Products1[Products1['rn'] == 1][['product_id', 'new_price']].rename(columns={'new_price': 'price'})
dat2 = Products.groupby(['product_id'])['change_date'].min().to_frame('min').reset_index()
dat2 = dat2[dat2['min'] > pd.Timestamp('2019-08-16')][['product_id']]  # never changed by the cutoff
dat2['price'] = 10
pd.concat([dat1, dat2])  # concat takes a LIST of frames
sql
#v1:
select round(100*sum(case when order_date=customer_pref_delivery_date then 1 else 0 end)/sum(1),2)immediate_percentage
from Delivery
where (customer_id,order_date) in
(select customer_id,min(order_date) mindate
from Delivery
group by customer_id)
sql
v1: mysql
select DATE_FORMAT(trans_date,"%Y-%m") as month, country, count(*) as trans_count,
sum(case when state = 'approved' then 1 else 0 end) as approved_count,
sum(amount) as trans_total_amount,
sum(case when state = 'approved' then amount else 0 end) as approved_total_amount
from Transactions
group by country, DATE_FORMAT(trans_date,"%Y-%m")
python